aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commit1ba4a712dde6e6c613fc411a96958b4ade67de4c (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys
parent8fc061164d74a4c9775f39da3c0b5d02112866c8 (diff)
downloadsrc-1ba4a712dde6e6c613fc411a96958b4ade67de4c.tar.gz
src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.zip
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge amount of changes; I'll enumerate only the user-visible ones: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows additional disks to be used for cache. Huge performance improvements, mostly for random reads of mostly static content. - slog Allows additional disks to be used for the ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems that they own. Be very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support for booting off of a ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Previously, if a write request failed, the system panicked. Now one can select one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Like the quota and reservation properties, but they don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Notes
Notes: svn path=/head/; revision=185029
Diffstat (limited to 'sys')
-rw-r--r--sys/boot/Makefile4
-rw-r--r--sys/boot/common/bootstrap.h1
-rw-r--r--sys/boot/i386/Makefile4
-rw-r--r--sys/boot/i386/libi386/bootinfo32.c1
-rw-r--r--sys/boot/i386/libi386/devicename.c2
-rw-r--r--sys/boot/i386/loader/Makefile10
-rw-r--r--sys/boot/i386/loader/conf.c14
-rw-r--r--sys/boot/i386/loader/main.c51
-rw-r--r--sys/boot/i386/zfsboot/Makefile108
-rw-r--r--sys/boot/i386/zfsboot/zfsboot.c944
-rw-r--r--sys/boot/i386/zfsboot/zfsldr.S402
-rw-r--r--sys/boot/zfs/Makefile29
-rw-r--r--sys/boot/zfs/zfs.c514
-rw-r--r--sys/boot/zfs/zfsimpl.c1443
-rw-r--r--sys/cddl/boot/zfs/README14
-rw-r--r--sys/cddl/boot/zfs/fletcher.c60
-rw-r--r--sys/cddl/boot/zfs/lzjb.c74
-rw-r--r--sys/cddl/boot/zfs/sha256.c127
-rw-r--r--sys/cddl/boot/zfs/zfsimpl.h1151
-rw-r--r--sys/cddl/boot/zfs/zfssubr.c193
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c9
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c23
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c112
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_misc.c18
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_policy.c136
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c69
-rw-r--r--sys/cddl/compat/opensolaris/kern/opensolaris_zone.c104
-rw-r--r--sys/cddl/compat/opensolaris/sys/atomic.h9
-rw-r--r--sys/cddl/compat/opensolaris/sys/callb.h219
-rw-r--r--sys/cddl/compat/opensolaris/sys/cred.h21
-rw-r--r--sys/cddl/compat/opensolaris/sys/dnlc.h2
-rw-r--r--sys/cddl/compat/opensolaris/sys/file.h57
-rw-r--r--sys/cddl/compat/opensolaris/sys/kidmap.h41
-rw-r--r--sys/cddl/compat/opensolaris/sys/kmem.h5
-rw-r--r--sys/cddl/compat/opensolaris/sys/misc.h7
-rw-r--r--sys/cddl/compat/opensolaris/sys/mntent.h2
-rw-r--r--sys/cddl/compat/opensolaris/sys/param.h4
-rw-r--r--sys/cddl/compat/opensolaris/sys/pathname.h54
-rw-r--r--sys/cddl/compat/opensolaris/sys/policy.h34
-rw-r--r--sys/cddl/compat/opensolaris/sys/proc.h9
-rw-r--r--sys/cddl/compat/opensolaris/sys/refstr.h34
-rw-r--r--sys/cddl/compat/opensolaris/sys/sid.h54
-rw-r--r--sys/cddl/compat/opensolaris/sys/sig.h69
-rw-r--r--sys/cddl/compat/opensolaris/sys/sunddi.h2
-rw-r--r--sys/cddl/compat/opensolaris/sys/sysmacros.h4
-rw-r--r--sys/cddl/compat/opensolaris/sys/time.h3
-rw-r--r--sys/cddl/compat/opensolaris/sys/types.h7
-rw-r--r--sys/cddl/compat/opensolaris/sys/uio.h2
-rw-r--r--sys/cddl/compat/opensolaris/sys/vfs.h16
-rw-r--r--sys/cddl/compat/opensolaris/sys/vnode.h125
-rw-r--r--sys/cddl/compat/opensolaris/sys/zone.h6
-rw-r--r--sys/cddl/contrib/opensolaris/common/acl/acl_common.c1598
-rw-r--r--sys/cddl/contrib/opensolaris/common/acl/acl_common.h27
-rw-r--r--sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S12
-rw-r--r--sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S49
-rw-r--r--sys/cddl/contrib/opensolaris/common/avl/avl.c66
-rw-r--r--sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c306
-rw-r--r--sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c2130
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c65
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h44
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c234
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h81
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c84
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h7
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c800
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h91
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c186
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c406
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/Makefile.files15
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c397
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c74
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c2295
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c53
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c405
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c285
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c597
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c841
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c36
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c248
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c23
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c403
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c132
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c2413
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c735
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c496
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c395
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c313
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c929
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c35
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c156
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c249
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c32
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c3038
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c232
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c121
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c776
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c27
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h108
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h41
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h54
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h79
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h131
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h45
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h112
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h52
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h76
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h40
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h160
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h125
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h72
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h162
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h144
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h216
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h172
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c17
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c1206
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c105
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c287
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c445
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c547
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c155
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c84
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c180
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c178
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c421
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c2499
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c102
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c423
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c326
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c120
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c716
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c2294
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c456
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c579
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c796
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c1762
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c1101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c377
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c2470
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c96
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c715
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/callb.c31
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/list.c84
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/taskq.c21
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr.c58
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr_array.c15
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/acl.h (renamed from sys/cddl/compat/opensolaris/sys/acl.h)95
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h61
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/avl.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h49
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/callb.h11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/cred.h34
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/dkio.h26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/dklabel.h33
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h77
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h37
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h454
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h37
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h93
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/list.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h31
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/processor.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/synch.h43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h247
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h108
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h91
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h35376
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h395
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c12
-rw-r--r--sys/conf/files1
-rw-r--r--sys/kern/kern_jail.c262
-rw-r--r--sys/kern/kern_osd.c301
-rw-r--r--sys/kern/kern_proc.c3
-rw-r--r--sys/kern/kern_thread.c3
-rw-r--r--sys/kern/vfs_lookup.c31
-rw-r--r--sys/kern/vfs_subr.c14
-rw-r--r--sys/modules/zfs/Makefile4
-rw-r--r--sys/sys/conf.h2
-rw-r--r--sys/sys/jail.h9
-rw-r--r--sys/sys/mount.h1
-rw-r--r--sys/sys/namei.h14
-rw-r--r--sys/sys/osd.h89
-rw-r--r--sys/sys/priv.h2
-rw-r--r--sys/sys/proc.h2
-rw-r--r--sys/sys/vnode.h1
228 files changed, 77058 insertions, 11097 deletions
diff --git a/sys/boot/Makefile b/sys/boot/Makefile
index 1af14577652e..27cb7e34308a 100644
--- a/sys/boot/Makefile
+++ b/sys/boot/Makefile
@@ -26,6 +26,10 @@ SUBDIR+= ofw
SUBDIR+= uboot
.endif
+.if defined(LOADER_ZFS_SUPPORT)
+SUBDIR+= zfs
+.endif
+
# Pick the machine-dependent subdir based on the target architecture.
ADIR= ${MACHINE:S/amd64/i386/:S/sun4v/sparc64/}
.if exists(${.CURDIR}/${ADIR}/.)
diff --git a/sys/boot/common/bootstrap.h b/sys/boot/common/bootstrap.h
index 57982d1e9853..5f0848089f43 100644
--- a/sys/boot/common/bootstrap.h
+++ b/sys/boot/common/bootstrap.h
@@ -43,6 +43,7 @@ struct devdesc
#define DEVT_DISK 1
#define DEVT_NET 2
#define DEVT_CD 3
+#define DEVT_ZFS 4
int d_unit;
};
diff --git a/sys/boot/i386/Makefile b/sys/boot/i386/Makefile
index b89222d85b4d..6af86425e7f7 100644
--- a/sys/boot/i386/Makefile
+++ b/sys/boot/i386/Makefile
@@ -1,7 +1,7 @@
# $FreeBSD$
-SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot kgzldr \
- libi386 libfirewire loader
+SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot zfsboot \
+ kgzldr libi386 libfirewire loader
# special boot programs, 'self-extracting boot2+loader'
SUBDIR+= pxeldr
diff --git a/sys/boot/i386/libi386/bootinfo32.c b/sys/boot/i386/libi386/bootinfo32.c
index 6b517c5d3707..d43442783518 100644
--- a/sys/boot/i386/libi386/bootinfo32.c
+++ b/sys/boot/i386/libi386/bootinfo32.c
@@ -183,6 +183,7 @@ bi_load32(char *args, int *howtop, int *bootdevp, vm_offset_t *bip, vm_offset_t
break;
case DEVT_NET:
+ case DEVT_ZFS:
break;
default:
diff --git a/sys/boot/i386/libi386/devicename.c b/sys/boot/i386/libi386/devicename.c
index e1035aa33d05..79a562b62283 100644
--- a/sys/boot/i386/libi386/devicename.c
+++ b/sys/boot/i386/libi386/devicename.c
@@ -167,6 +167,7 @@ i386_parsedev(struct i386_devdesc **dev, const char *devspec, const char **path)
case DEVT_CD:
case DEVT_NET:
+ case DEVT_ZFS:
unit = 0;
if (*np && (*np != ':')) {
@@ -238,6 +239,7 @@ i386_fmtdev(void *vdev)
break;
case DEVT_NET:
+ case DEVT_ZFS:
sprintf(buf, "%s%d:", dev->d_dev->dv_name, dev->d_unit);
break;
}
diff --git a/sys/boot/i386/loader/Makefile b/sys/boot/i386/loader/Makefile
index df2ccc0f15ad..79aceca277c6 100644
--- a/sys/boot/i386/loader/Makefile
+++ b/sys/boot/i386/loader/Makefile
@@ -17,6 +17,12 @@ CFLAGS+= -DLOADER_FIREWIRE_SUPPORT
LIBFIREWIRE= ${.OBJDIR}/../libfirewire/libfirewire.a
.endif
+# Put LOADER_ZFS_SUPPORT=yes in /etc/make.conf for ZFS support
+.if defined(LOADER_ZFS_SUPPORT)
+CFLAGS+= -DLOADER_ZFS_SUPPORT
+LIBZFS= ${.OBJDIR}/../../zfs/libzfsboot.a
+.endif
+
# Enable PXE TFTP or NFS support, not both.
.if defined(LOADER_TFTP_SUPPORT)
CFLAGS+= -DLOADER_TFTP_SUPPORT
@@ -98,8 +104,8 @@ FILES+= loader.rc
# XXX crt0.o needs to be first for pxeboot(8) to work
OBJS= ${BTXCRT}
-DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} ${LIBSTAND}
-LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} -lstand
+DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} ${LIBSTAND}
+LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} -lstand
.include <bsd.prog.mk>
diff --git a/sys/boot/i386/loader/conf.c b/sys/boot/i386/loader/conf.c
index 245f960ed210..05c9a9e95f3c 100644
--- a/sys/boot/i386/loader/conf.c
+++ b/sys/boot/i386/loader/conf.c
@@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$");
extern struct devsw fwohci;
#endif
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct devsw zfs_dev;
+#endif
+
/* Exported for libstand */
struct devsw *devsw[] = {
&bioscd,
@@ -60,15 +64,25 @@ struct devsw *devsw[] = {
#if defined(LOADER_FIREWIRE_SUPPORT)
&fwohci,
#endif
+#if defined(LOADER_ZFS_SUPPORT)
+ &zfs_dev,
+#endif
NULL
};
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct fs_ops zfs_fsops;
+#endif
+
struct fs_ops *file_system[] = {
&ufs_fsops,
&ext2fs_fsops,
&dosfs_fsops,
&cd9660_fsops,
&splitfs_fsops,
+#if defined(LOADER_ZFS_SUPPORT)
+ &zfs_fsops,
+#endif
#ifdef LOADER_GZIP_SUPPORT
&gzipfs_fsops,
#endif
diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c
index 5b23670d12d0..cac28aef9763 100644
--- a/sys/boot/i386/loader/main.c
+++ b/sys/boot/i386/loader/main.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#define KARGS_FLAGS_CD 0x1
#define KARGS_FLAGS_PXE 0x2
+#define KARGS_FLAGS_ZFS 0x4
/* Arguments passed in from the boot1/boot2 loader */
static struct
@@ -51,8 +52,13 @@ static struct
u_int32_t howto;
u_int32_t bootdev;
u_int32_t bootflags;
- u_int32_t pxeinfo;
- u_int32_t res2;
+ union {
+ struct {
+ u_int32_t pxeinfo;
+ u_int32_t res2;
+ };
+ uint64_t zfspool;
+ };
u_int32_t bootinfo;
} *kargs;
@@ -96,7 +102,7 @@ main(void)
*/
bios_getmem();
-#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT)
+#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) || defined(LOADER_ZFS_SUPPORT)
heap_top = PTOV(memtop_copyin);
memtop_copyin -= 0x300000;
heap_bottom = PTOV(memtop_copyin);
@@ -145,6 +151,14 @@ main(void)
bc_add(initial_bootdev);
}
+ archsw.arch_autoload = i386_autoload;
+ archsw.arch_getdev = i386_getdev;
+ archsw.arch_copyin = i386_copyin;
+ archsw.arch_copyout = i386_copyout;
+ archsw.arch_readin = i386_readin;
+ archsw.arch_isainb = isa_inb;
+ archsw.arch_isaoutb = isa_outb;
+
/*
* March through the device switch probing for things.
*/
@@ -172,14 +186,6 @@ main(void)
bios_getsmap();
- archsw.arch_autoload = i386_autoload;
- archsw.arch_getdev = i386_getdev;
- archsw.arch_copyin = i386_copyin;
- archsw.arch_copyout = i386_copyout;
- archsw.arch_readin = i386_readin;
- archsw.arch_isainb = isa_inb;
- archsw.arch_isaoutb = isa_outb;
-
interact(); /* doesn't return */
/* if we ever get here, it is an error */
@@ -252,6 +258,29 @@ extract_currdev(void)
i386_setcurrdev, env_nounset);
env_setenv("loaddev", EV_VOLATILE, i386_fmtdev(&new_currdev), env_noset,
env_nounset);
+
+#ifdef LOADER_ZFS_SUPPORT
+ /*
+ * If we were started from a ZFS-aware boot2, we can work out
+ * which ZFS pool we are booting from.
+ */
+ if (kargs->bootflags & KARGS_FLAGS_ZFS) {
+ /*
+ * Dig out the pool guid and convert it to a 'unit number'
+ */
+ uint64_t guid;
+ int unit;
+ char devname[32];
+ extern int zfs_guid_to_unit(uint64_t);
+
+ guid = kargs->zfspool;
+ unit = zfs_guid_to_unit(guid);
+ if (unit >= 0) {
+ sprintf(devname, "zfs%d", unit);
+ setenv("currdev", devname, 1);
+ }
+ }
+#endif
}
COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot);
diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile
new file mode 100644
index 000000000000..41f1672c82ef
--- /dev/null
+++ b/sys/boot/i386/zfsboot/Makefile
@@ -0,0 +1,108 @@
+# $FreeBSD$
+
+# Also search the sibling boot2 directory for source files.
+.PATH: ${.CURDIR}/../boot2
+
+FILES= zfsboot
+
+NM?= nm
+
+# A value of 0x80 enables LBA support.
+BOOT_BOOT1_FLAGS?= 0x80
+
+# Serial console settings; handed to the compiler below as SIOPRT,
+# SIOSPD and SIOFMT.
+BOOT_COMCONSOLE_PORT?= 0x3f8
+BOOT_COMCONSOLE_SPEED?= 9600
+B2SIOFMT?= 0x3
+
+# ORG1 and ORG2 are the -Ttext link addresses of zfsldr and zfsboot.
+# REL1 and ORG1 are also used to compute XREADORG for zfsboot.h.
+REL1= 0x700
+ORG1= 0x7c00
+ORG2= 0x2000
+
+CFLAGS= -Os -g \
+ -fno-guess-branch-probability \
+ -fomit-frame-pointer \
+ -fno-unit-at-a-time \
+ -mno-align-long-strings \
+ -mrtd \
+ -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 \
+ -DBOOT2 \
+ -DFLAGS=${BOOT_BOOT1_FLAGS} \
+ -DSIOPRT=${BOOT_COMCONSOLE_PORT} \
+ -DSIOFMT=${B2SIOFMT} \
+ -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \
+ -I${.CURDIR}/../../zfs \
+ -I${.CURDIR}/../../../cddl/boot/zfs \
+ -I${.CURDIR}/../btx/lib -I. \
+ -I${.CURDIR}/../boot2 \
+ -Wall -Waggregate-return -Wbad-function-cast -Wcast-align \
+ -Wmissing-declarations -Wmissing-prototypes -Wnested-externs \
+ -Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \
+ -Winline --param max-inline-insns-single=100
+
+LDFLAGS=-static -N --gc-sections
+
+# Pick up ../Makefile.inc early.
+.include <bsd.init.mk>
+
+CLEANFILES= zfsboot
+
+# The final image is zfsboot1 immediately followed by zfsboot2.
+zfsboot: zfsboot1 zfsboot2
+ cat zfsboot1 zfsboot2 > zfsboot
+
+CLEANFILES+= zfsboot1 zfsldr.out zfsldr.o
+
+# zfsboot1 is zfsldr linked at ORG1 and converted to a flat binary.
+zfsboot1: zfsldr.out
+ objcopy -S -O binary zfsldr.out ${.TARGET}
+
+zfsldr.out: zfsldr.o
+ ${LD} ${LDFLAGS} -e start -Ttext ${ORG1} -o ${.TARGET} zfsldr.o
+
+CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \
+ zfsboot.o zfsboot.s zfsboot.s.tmp zfsboot.h sio.o
+
+# We currently allow 32768 bytes for zfsboot - in practice it could be
+# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr.
+#
+BOOT2SIZE= 32768
+
+# Fail the build if zfsboot.ld is larger than BOOT2SIZE, then use dd to
+# pad it out to exactly that size.
+zfsboot2: zfsboot.ld
+ @set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \
+ echo "$$x bytes available"; test $$x -ge 0
+ dd if=zfsboot.ld of=${.TARGET} obs=${BOOT2SIZE} conv=osync
+
+zfsboot.ld: zfsboot.ldr zfsboot.bin ${BTXKERN}
+ btxld -v -E ${ORG2} -f bin -b ${BTXKERN} -l zfsboot.ldr \
+ -o ${.TARGET} -P 1 zfsboot.bin
+
+# An empty file is handed to btxld as its -l argument.
+zfsboot.ldr:
+ cp /dev/null ${.TARGET}
+
+zfsboot.bin: zfsboot.out
+ objcopy -S -O binary zfsboot.out ${.TARGET}
+
+zfsboot.out: ${BTXCRT} zfsboot.o sio.o
+ ${LD} ${LDFLAGS} -Ttext ${ORG2} -o ${.TARGET} ${.ALLSRC}
+
+zfsboot.o: zfsboot.s
+
+SRCS= zfsboot.c zfsboot.h
+
+# Compile zfsboot.c to assembly, then drop every line that contains
+# "align" or "nop" before the result is assembled.
+zfsboot.s: zfsboot.c zfsboot.h ${.CURDIR}/../../zfs/zfsimpl.c
+ ${CC} ${CFLAGS} -S -o zfsboot.s.tmp ${.CURDIR}/zfsboot.c
+ sed -e '/align/d' -e '/nop/d' < zfsboot.s.tmp > zfsboot.s
+ rm -f zfsboot.s.tmp
+
+# Generate zfsboot.h: look up the address of the xread symbol in
+# zfsldr.out, rebase it from ORG1 to REL1 and emit it as XREADORG.
+zfsboot.h: zfsldr.out
+ ${NM} -t d ${.ALLSRC} | awk '/([0-9])+ T xread/ \
+ { x = $$1 - ORG1; \
+ printf("#define XREADORG %#x\n", REL1 + x) }' \
+ ORG1=`printf "%d" ${ORG1}` \
+ REL1=`printf "%d" ${REL1}` > ${.TARGET}
+
+# On amd64, create a local "machine" symlink pointing at the i386
+# headers before dependencies are generated or zfsboot.s is built.
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend zfsboot.s: machine
+CLEANFILES+= machine
+machine:
+ ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.prog.mk>
diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c
new file mode 100644
index 000000000000..9b0a465cbac3
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsboot.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/diskmbr.h>
+#include <sys/reboot.h>
+#include <sys/queue.h>
+
+#include <machine/bootinfo.h>
+#include <machine/elf.h>
+
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <a.out.h>
+
+#include <btxv86.h>
+
+#include "zfsboot.h"
+#include "lib.h"
+
+#define IO_KEYBOARD 1
+#define IO_SERIAL 2
+
+#define SECOND 18 /* Circa that many ticks in a second. */
+
+#define RBX_ASKNAME 0x0 /* -a */
+#define RBX_SINGLE 0x1 /* -s */
+/* 0x2 is reserved for log2(RB_NOSYNC). */
+/* 0x3 is reserved for log2(RB_HALT). */
+/* 0x4 is reserved for log2(RB_INITNAME). */
+#define RBX_DFLTROOT 0x5 /* -r */
+#define RBX_KDB 0x6 /* -d */
+/* 0x7 is reserved for log2(RB_RDONLY). */
+/* 0x8 is reserved for log2(RB_DUMP). */
+/* 0x9 is reserved for log2(RB_MINIROOT). */
+#define RBX_CONFIG 0xa /* -c */
+#define RBX_VERBOSE 0xb /* -v */
+#define RBX_SERIAL 0xc /* -h */
+#define RBX_CDROM 0xd /* -C */
+/* 0xe is reserved for log2(RB_POWEROFF). */
+#define RBX_GDB 0xf /* -g */
+#define RBX_MUTE 0x10 /* -m */
+/* 0x11 is reserved for log2(RB_SELFTEST). */
+/* 0x12 is reserved for boot programs. */
+/* 0x13 is reserved for boot programs. */
+#define RBX_PAUSE 0x14 /* -p */
+#define RBX_QUIET 0x15 /* -q */
+#define RBX_NOINTR 0x1c /* -n */
+/* 0x1d is reserved for log2(RB_MULTIPLE) and is just misnamed here. */
+#define RBX_DUAL 0x1d /* -D */
+/* 0x1f is reserved for log2(RB_BOOTINFO). */
+
+/* pass: -a, -s, -r, -d, -c, -v, -h, -C, -g, -m, -p, -D */
+#define RBX_MASK (OPT_SET(RBX_ASKNAME) | OPT_SET(RBX_SINGLE) | \
+ OPT_SET(RBX_DFLTROOT) | OPT_SET(RBX_KDB ) | \
+ OPT_SET(RBX_CONFIG) | OPT_SET(RBX_VERBOSE) | \
+ OPT_SET(RBX_SERIAL) | OPT_SET(RBX_CDROM) | \
+ OPT_SET(RBX_GDB ) | OPT_SET(RBX_MUTE) | \
+ OPT_SET(RBX_PAUSE) | OPT_SET(RBX_DUAL))
+
+/* Hint to loader that we came from ZFS */
+#define KARGS_FLAGS_ZFS 0x4
+
+#define PATH_CONFIG "/boot.config"
+#define PATH_BOOT3 "/boot/loader"
+#define PATH_KERNEL "/boot/kernel/kernel"
+
+#define ARGS 0x900
+#define NOPT 14
+#define NDEV 3
+#define MEM_BASE 0x12
+#define MEM_EXT 0x15
+#define V86_CY(x) ((x) & 1)
+#define V86_ZR(x) ((x) & 0x40)
+
+#define DRV_HARD 0x80
+#define DRV_MASK 0x7f
+
+#define TYPE_AD 0
+#define TYPE_DA 1
+#define TYPE_MAXHARD TYPE_DA
+#define TYPE_FD 2
+
+#define OPT_SET(opt) (1 << (opt))
+#define OPT_CHECK(opt) ((opts) & OPT_SET(opt))
+
+extern uint32_t _end;
+
+static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
+static const unsigned char flags[NOPT] = {
+ RBX_DUAL,
+ RBX_SERIAL,
+ RBX_ASKNAME,
+ RBX_CDROM,
+ RBX_CONFIG,
+ RBX_KDB,
+ RBX_GDB,
+ RBX_MUTE,
+ RBX_NOINTR,
+ RBX_PAUSE,
+ RBX_QUIET,
+ RBX_DFLTROOT,
+ RBX_SINGLE,
+ RBX_VERBOSE
+};
+
+static const char *const dev_nm[NDEV] = {"ad", "da", "fd"};
+static const unsigned char dev_maj[NDEV] = {30, 4, 2};
+
+struct dsk {
+ unsigned drive;
+ unsigned type;
+ unsigned unit;
+ unsigned slice;
+ unsigned part;
+ unsigned start;
+ int init;
+};
+static char cmd[512];
+static char kname[1024];
+static uint32_t opts;
+static int comspeed = SIOSPD;
+static struct bootinfo bootinfo;
+static uint32_t bootdev;
+static uint8_t ioctrl = IO_KEYBOARD;
+
+/* Buffers that must not span a 64k boundary. */
+#define READ_BUF_SIZE 8192
+struct dmadat {
+ char rdbuf[READ_BUF_SIZE]; /* for reading large things */
+ char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */
+};
+static struct dmadat *dmadat;
+
+void exit(int);
+static void load(void);
+static int parse(void);
+static void printf(const char *,...);
+static void putchar(int);
+static uint32_t memsize(void);
+static int drvread(struct dsk *, void *, unsigned, unsigned);
+static int keyhit(unsigned);
+static int xputc(int);
+static int xgetc(int);
+static int getc(int);
+
+static void memcpy(void *, const void *, int);
+
+/*
+ * Copy len bytes from src to dst, one byte at a time, lowest address
+ * first.  Nothing is returned and len is a plain int, unlike the libc
+ * routine of the same name.
+ */
+static void
+memcpy(void *dst, const void *src, int len)
+{
+	char *out = dst;
+	const char *in = src;
+
+	for (; len-- != 0; out++, in++)
+		*out = *in;
+}
+
+/*
+ * Copy the NUL-terminated string src, terminator included, to dst.
+ * No bounds are checked and nothing is returned.
+ */
+static void
+strcpy(char *dst, const char *src)
+{
+	char c;
+
+	do {
+		c = *src++;
+		*dst++ = c;
+	} while (c != 0);
+}
+
+/*
+ * Append the NUL-terminated string src to the end of the string in
+ * dst and terminate the result.  No bounds are checked and nothing is
+ * returned.
+ */
+static void
+strcat(char *dst, const char *src)
+{
+	char *tail;
+
+	/* Find the terminator of the existing string. */
+	for (tail = dst; *tail != 0; tail++)
+		;
+	for (; *src != 0; src++)
+		*tail++ = *src;
+	*tail = 0;
+}
+
+/*
+ * Compare two NUL-terminated strings.  Returns zero if they are equal,
+ * otherwise the difference between the first pair of bytes that
+ * differ, both taken as unsigned char.
+ */
+static int
+strcmp(const char *s1, const char *s2)
+{
+	while (*s1 != 0 && *s1 == *s2) {
+		s1++;
+		s2++;
+	}
+	return (unsigned char)*s1 - (unsigned char)*s2;
+}
+
+/*
+ * Return a pointer to the first occurrence of ch in the NUL-terminated
+ * string s, or a null pointer if ch does not occur.
+ *
+ * The terminator counts as part of the string, so searching for 0
+ * returns a pointer to the terminating NUL, matching the standard
+ * strchr() contract.
+ */
+static const char *
+strchr(const char *s, char ch)
+{
+	for (;; s++) {
+		/* Test for a match first so that ch == 0 finds the terminator. */
+		if (*s == ch)
+			return s;
+		if (*s == 0)
+			return 0;
+	}
+}
+
+/*
+ * Compare the first n bytes of p1 and p2.  Returns zero if they are
+ * all equal, otherwise the difference between the first pair of bytes
+ * that differ, both taken as unsigned char.
+ */
+static int
+memcmp(const void *p1, const void *p2, size_t n)
+{
+	const unsigned char *a = p1;
+	const unsigned char *b = p2;
+	size_t i;
+
+	for (i = 0; i < n; i++) {
+		if (a[i] != b[i])
+			return a[i] - b[i];
+	}
+	return 0;
+}
+
+/*
+ * Store val into each of the first n bytes starting at p.  Nothing is
+ * returned.
+ */
+static void
+memset(void *p, char val, size_t n)
+{
+	char *bytes = p;
+	size_t i;
+
+	for (i = 0; i < n; i++)
+		bytes[i] = val;
+}
+
+/*
+ * Minimal bump allocator.
+ *
+ * On first use the heap is set up to start right after the dmadat
+ * buffers and to end at address 640*1024 (presumably the top of
+ * conventional memory - TODO confirm).  Each call hands out the next
+ * n bytes; there is no free() and no alignment is applied.
+ *
+ * Never returns a null pointer: if the request does not fit, a
+ * message is printed and the function spins forever.
+ */
+static void *
+malloc(size_t n)
+{
+	static char *heap_next;
+	static char *heap_end;
+	char *p;
+
+	if (!heap_next) {
+		heap_next = (char *) dmadat + sizeof(*dmadat);
+		heap_end = (char *) (640*1024);
+	}
+
+	p = heap_next;
+	/*
+	 * Compare n against the space that is left instead of computing
+	 * p + n, which could wrap around for a huge n and slip past the
+	 * check.
+	 */
+	if (p > heap_end || n > (size_t)(heap_end - p)) {
+		printf("malloc failure\n");
+		for (;;)
+			;
+	}
+	heap_next += n;
+	return p;
+}
+
+/*
+ * Return the number of bytes in the string s, not counting the
+ * terminating NUL.
+ */
+static size_t
+strlen(const char *s)
+{
+	const char *end = s;
+
+	while (*end != 0)
+		end++;
+	return (size_t)(end - s);
+}
+
+/*
+ * Return a copy of the string s in storage obtained from malloc().
+ */
+static char *
+strdup(const char *s)
+{
+	size_t len = strlen(s);
+	char *copy = malloc(len + 1);	/* room for the terminator */
+
+	strcpy(copy, s);
+	return copy;
+}
+
+#include "zfsimpl.c"
+
+/*
+ * Read from a dnode (which must be from a ZPL filesystem).
+ *
+ * Reads up to size bytes at offset *offp into start.  The request is
+ * clamped to the file size recorded in the znode kept in the dnode's
+ * bonus buffer, and *offp is advanced by the amount read.
+ *
+ * Returns the number of bytes read (0 at or beyond end of file), or -1
+ * if the offset is negative or dnode_read() reports an error.
+ */
+static int
+zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
+{
+	const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
+	size_t n;
+	int rc;
+
+	if (*offp < 0)
+		return (-1);
+
+	/*
+	 * Nothing to read at or past end of file.  Without this check
+	 * the subtraction below would underflow and yield a bogus,
+	 * huge length for an offset beyond the file size.
+	 */
+	if (*offp >= zp->zp_size)
+		return (0);
+
+	/* Clamp to what is left; written so that no addition can overflow. */
+	n = size;
+	if (n > zp->zp_size - *offp)
+		n = zp->zp_size - *offp;
+
+	rc = dnode_read(spa, dnode, *offp, start, n);
+	if (rc)
+		return (-1);
+	*offp += n;
+
+	return (n);
+}
+
+/*
+ * Current ZFS pool: the pool in which the configuration file, loader and
+ * kernel paths are looked up.  It is chosen in main() and replaced by
+ * parse() when the user names another pool with "pool:path".
+ */
+spa_t *spa;
+
+/*
+ * A wrapper for drvread that doesn't have to worry about whether the
+ * buffer pointer crosses a 64k boundary: the data is read into
+ * dmadat->rdbuf in pieces of at most READ_BUF_SIZE bytes and copied from
+ * there to the caller's buffer.
+ *
+ * 'priv' is the struct dsk to read from; 'off' and 'bytes' are a byte
+ * offset and length which must both be multiples of DEV_BSIZE.  Returns
+ * 0 on success and -1 on a misaligned request or a read error.
+ */
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
+{
+    struct dsk *dsk = (struct dsk *) priv;
+    char *dest = buf;
+    unsigned int lba, nb;
+
+    if ((off & (DEV_BSIZE - 1)) != 0)
+        return -1;
+    if ((bytes & (DEV_BSIZE - 1)) != 0)
+        return -1;
+
+    for (lba = off / DEV_BSIZE; bytes > 0; lba += nb) {
+        /* Read as many sectors as fit in the bounce buffer. */
+        nb = bytes / DEV_BSIZE;
+        if (nb > READ_BUF_SIZE / DEV_BSIZE)
+            nb = READ_BUF_SIZE / DEV_BSIZE;
+        if (drvread(dsk, dmadat->rdbuf, lba, nb))
+            return -1;
+        memcpy(dest, dmadat->rdbuf, nb * DEV_BSIZE);
+        dest += nb * DEV_BSIZE;
+        bytes -= nb * DEV_BSIZE;
+    }
+
+    return 0;
+}
+
+/*
+ * Read exactly nbyte bytes from the dnode at *offp in the current pool.
+ * A short or failed read is reported on the console and yields -1;
+ * a complete read yields 0.
+ */
+static int
+xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
+{
+    size_t got;
+
+    got = (size_t)zfs_read(spa, dnode, offp, buf, nbyte);
+    if (got == nbyte)
+        return 0;
+    printf("Invalid %s\n", "format");
+    return -1;
+}
+
+/*
+ * Issue the MEM_EXT interrupt with 0x8800 in %eax through v86int() and
+ * return the resulting %eax; main() stores the value in
+ * bootinfo.bi_extmem.
+ * NOTE(review): presumably BIOS int 0x15/AH=0x88 (extended memory size
+ * in KB) -- confirm against the definition of MEM_EXT.
+ */
+static inline uint32_t
+memsize(void)
+{
+    uint32_t ext;
+
+    v86.eax = 0x8800;
+    v86.addr = MEM_EXT;
+    v86int();
+    ext = v86.eax;
+    return ext;
+}
+
+/*
+ * Read a line from the console into cmd, echoing as it goes.  Backspace
+ * and DEL erase the previous character; newline or carriage return ends
+ * the line and NUL-terminates cmd.  Characters that no longer fit in
+ * cmd are echoed but not stored.
+ */
+static inline void
+getstr(void)
+{
+    char *end = cmd;
+    int ch;
+
+    for (;;) {
+        ch = xgetc(0);
+        if (ch == 0)
+            continue;
+        if (ch == '\n' || ch == '\r') {
+            *end = 0;
+            return;
+        }
+        if (ch == '\177' || ch == '\b') {
+            if (end > cmd) {
+                end--;
+                printf("\b \b");
+            }
+            continue;
+        }
+        /* Keep one byte free for the terminator. */
+        if (end - cmd < sizeof(cmd) - 1)
+            *end++ = ch;
+        putchar(ch);
+    }
+}
+
+/*
+ * Write one character to the screen with BIOS interrupt 0x10: function
+ * 0x0e in %ah, the character in %al and 0x7 in %ebx.
+ */
+static inline void
+putc(int c)
+{
+    v86.ebx = 0x7;
+    v86.eax = 0xe00 | (c & 0xff);
+    v86.addr = 0x10;
+    v86int();
+}
+
+/*
+ * Try to detect a device supported by the legacy int13 BIOS.
+ *
+ * Issues int 0x13 with %eax = 0x800 for 'drive' and returns 1 when the
+ * call succeeds (carry clear), the low byte of %edx differs from the
+ * unit number and the sectors-per-track field is non-zero; returns 0
+ * otherwise.
+ *
+ * NOTE(review): the unit check is an inequality.  If the low byte of
+ * %edx is the number of drives, "greater than the unit" may have been
+ * intended -- confirm against the BIOS documentation before changing.
+ */
+static int
+int13probe(int drive)
+{
+    v86.ctl = V86_FLAGS;
+    v86.addr = 0x13;
+    v86.eax = 0x800;
+    v86.edx = drive;
+    v86int();
+
+    if (v86.efl & 0x1)                              /* carry set */
+        return(0);
+    if ((v86.edx & 0xff) == (drive & DRV_MASK))     /* unit # not OK */
+        return(0);
+    if ((v86.ecx & 0x3f) == 0)                      /* absurd sector size */
+        return(0);                                  /* skip device */
+    return (1);
+}
+
+/*
+ * Look for ZFS vdevs on the drive described by dsk, first on the whole
+ * disk and then on every non-empty slice of its MBR.  If spap is not
+ * null it is passed to vdev_probe() to receive the pool that was found;
+ * only the first slice with a vdev gets it.
+ */
+static void
+probe_drive(struct dsk *dsk, spa_t **spap)
+{
+    struct dos_partition *dp;
+    struct dsk *newdsk;
+    char *sec;
+    unsigned i;
+
+    if (!int13probe(dsk->drive))
+        return;
+
+    /*
+     * If we find a vdev on the whole disk, stop here.  Otherwise dig
+     * out the MBR and probe each slice in turn for a vdev.
+     */
+    if (vdev_probe(vdev_read, dsk, spap) == 0)
+        return;
+
+    sec = dmadat->secbuf;
+    dsk->start = 0;
+    if (drvread(dsk, sec, DOSBBSECTOR, 1))
+        return;
+    dp = (void *)(sec + DOSPARTOFF);
+
+    for (i = 0; i < NDOSPART; i++) {
+        if (dp[i].dp_typ == 0)
+            continue;
+        dsk->start = dp[i].dp_start;
+        if (vdev_probe(vdev_read, dsk, spap) != 0)
+            continue;
+
+        /*
+         * We record only the first pool we find (we will try to boot
+         * from that one).
+         */
+        spap = 0;
+
+        /*
+         * This slice had a vdev.  We need a new dsk structure now
+         * since the vdev now owns this one.
+         */
+        newdsk = malloc(sizeof(struct dsk));
+        *newdsk = *dsk;
+        dsk = newdsk;
+    }
+}
+
+/*
+ * Boot block entry point.  Sets up the DMA bounce area and bootinfo,
+ * probes the boot drive and BIOS hard drives 0-3 for ZFS pools, mounts
+ * the chosen pool, applies PATH_CONFIG if it exists, tries the stage 3
+ * loader and finally drops into the interactive "boot:" prompt loop.
+ * Never returns.
+ */
+int
+main(void)
+{
+    int autoboot, i, n;
+    dnode_phys_t dn;
+    off_t off;
+    struct dsk *dsk;
+
+    dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
+    v86.ctl = V86_FLAGS;
+
+    /* Drive and slice are handed over by boot1 at ARGS. */
+    dsk = malloc(sizeof(struct dsk));
+    dsk->drive = *(uint8_t *)PTOV(ARGS);
+    dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD;
+    dsk->unit = dsk->drive & DRV_MASK;
+    dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
+    dsk->part = 0;
+    dsk->start = 0;
+    dsk->init = 0;
+
+    bootinfo.bi_version = BOOTINFO_VERSION;
+    bootinfo.bi_size = sizeof(bootinfo);
+    bootinfo.bi_basemem = 0;	/* XXX will be filled by loader or kernel */
+    bootinfo.bi_extmem = memsize();
+    bootinfo.bi_memsizes_valid++;
+    bootinfo.bi_bios_dev = dsk->drive;
+
+    bootdev = MAKEBOOTDEV(dev_maj[dsk->type],
+        dsk->slice, dsk->unit, dsk->part);
+
+    /* Process configuration file */
+
+    autoboot = 1;
+
+    zfs_init();
+
+    /*
+     * Probe the boot drive first - we will try to boot from whatever
+     * pool we find on that drive.
+     */
+    probe_drive(dsk, &spa);
+
+    /*
+     * Probe the rest of the drives that the bios knows about. This
+     * will find any other available pools and it may fill in missing
+     * vdevs for the boot pool.
+     */
+    for (i = 0; i < 4; i++) {
+        if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
+            continue;
+
+        dsk = malloc(sizeof(struct dsk));
+        dsk->drive = i | DRV_HARD;
+        dsk->type = TYPE_AD;
+        dsk->unit = i;
+        dsk->slice = 0;
+        dsk->part = 0;
+        dsk->start = 0;
+        dsk->init = 0;
+        probe_drive(dsk, 0);
+    }
+
+    /*
+     * If we didn't find a pool on the boot drive, default to the
+     * first pool we found, if any.
+     */
+    if (!spa) {
+        spa = STAILQ_FIRST(&zfs_pools);
+        if (!spa) {
+            printf("No ZFS pools located, can't boot\n");
+            for (;;)
+                ;
+        }
+    }
+
+    zfs_mount_pool(spa);
+
+    /*
+     * The configuration file is normally much shorter than cmd, so a
+     * short read is expected here and must not be treated as an error.
+     * One byte is kept back for the terminator; a failed read leaves an
+     * empty command.
+     */
+    if (zfs_lookup(spa, PATH_CONFIG, &dn) == 0) {
+        off = 0;
+        n = zfs_read(spa, &dn, &off, cmd, sizeof(cmd) - 1);
+        cmd[n > 0 ? n : 0] = 0;
+    }
+
+    if (*cmd) {
+        if (parse())
+            autoboot = 0;
+        if (!OPT_CHECK(RBX_QUIET))
+            printf("%s: %s", PATH_CONFIG, cmd);
+        /* Do not process this command twice */
+        *cmd = 0;
+    }
+
+    /*
+     * Try to exec stage 3 boot loader. If interrupted by a keypress,
+     * or in case of failure, try to load a kernel directly instead.
+     */
+
+    if (autoboot && !*kname) {
+        memcpy(kname, PATH_BOOT3, sizeof(PATH_BOOT3));
+        if (!keyhit(3*SECOND)) {
+            load();
+            memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL));
+        }
+    }
+
+    /* Present the user with the boot2 prompt. */
+
+    for (;;) {
+        if (!autoboot || !OPT_CHECK(RBX_QUIET))
+            printf("\nFreeBSD/i386 boot\n"
+                   "Default: %s:%s\n"
+                   "boot: ",
+                   spa->spa_name, kname);
+        if (ioctrl & IO_SERIAL)
+            sio_flush();
+        if (!autoboot || keyhit(5*SECOND))
+            getstr();
+        else if (!autoboot || !OPT_CHECK(RBX_QUIET))
+            putchar('\n');
+        autoboot = 0;
+        if (parse())
+            putchar('\a');
+        else
+            load();
+    }
+}
+
+/*
+ * XXX - Needed for btxld to link the boot2 binary; do not remove.
+ *
+ * The body is deliberately empty: the argument is ignored and the
+ * function simply returns to its caller.
+ */
+void
+exit(int x)
+{
+}
+
+/*
+ * Load the file named by kname from the current pool (spa) and pass
+ * control to it through __exec().  Two image formats are recognised:
+ * a.out with ZMAGIC and ELF.  On any lookup, read or format failure the
+ * function returns to the caller without starting anything.
+ *
+ * Besides the image itself, symbol information is copied behind it and
+ * its physical bounds are recorded in bootinfo.bi_symtab and
+ * bootinfo.bi_esymtab.  The image receives the boot flags, bootdev,
+ * KARGS_FLAGS_ZFS, the low and high halves of the pool GUID and the
+ * physical address of bootinfo.
+ */
+static void
+load(void)
+{
+ union {
+ struct exec ex;
+ Elf32_Ehdr eh;
+ } hdr;
+ static Elf32_Phdr ep[2];
+ static Elf32_Shdr es[2];
+ caddr_t p;
+ dnode_phys_t dn;
+ off_t off;
+ uint32_t addr, x;
+ int fmt, i, j;
+
+ if (zfs_lookup(spa, kname, &dn)) {
+ return;
+ }
+ off = 0;
+ if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
+ return;
+ /* fmt: 0 = a.out, 1 = ELF. */
+ if (N_GETMAGIC(hdr.ex) == ZMAGIC)
+ fmt = 0;
+ else if (IS_ELF(hdr.eh))
+ fmt = 1;
+ else {
+ printf("Invalid %s\n", "format");
+ return;
+ }
+ if (fmt == 0) {
+ /*
+ * a.out: text is read from file offset PAGE_SIZE to the entry
+ * address (masked to 24 bits), data follows on the next page
+ * boundary, then room is left for bss.
+ */
+ addr = hdr.ex.a_entry & 0xffffff;
+ p = PTOV(addr);
+ off = PAGE_SIZE;
+ if (xfsread(&dn, &off, p, hdr.ex.a_text))
+ return;
+ p += roundup2(hdr.ex.a_text, PAGE_SIZE);
+ if (xfsread(&dn, &off, p, hdr.ex.a_data))
+ return;
+ p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
+ bootinfo.bi_symtab = VTOP(p);
+ /* The symbol area starts with the a_syms size word. */
+ memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
+ p += sizeof(hdr.ex.a_syms);
+ if (hdr.ex.a_syms) {
+ if (xfsread(&dn, &off, p, hdr.ex.a_syms))
+ return;
+ p += hdr.ex.a_syms;
+ /*
+ * The next int in the file is a length that includes itself;
+ * read it, then the remaining x bytes it announces.
+ * NOTE(review): presumably the string table -- confirm.
+ */
+ if (xfsread(&dn, &off, p, sizeof(int)))
+ return;
+ x = *(uint32_t *)p;
+ p += sizeof(int);
+ x -= sizeof(int);
+ if (xfsread(&dn, &off, p, x))
+ return;
+ p += x;
+ }
+ } else {
+ /* ELF: collect the first two PT_LOAD program headers in ep[]. */
+ off = hdr.eh.e_phoff;
+ for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
+ if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
+ return;
+ if (ep[j].p_type == PT_LOAD)
+ j++;
+ }
+ /* Both ep[] entries are loaded regardless of how many were found. */
+ for (i = 0; i < 2; i++) {
+ p = PTOV(ep[i].p_paddr & 0xffffff);
+ off = ep[i].p_offset;
+ if (xfsread(&dn, &off, p, ep[i].p_filesz))
+ return;
+ }
+ /* p still points at the start of the second segment here. */
+ p += roundup2(ep[1].p_memsz, PAGE_SIZE);
+ bootinfo.bi_symtab = VTOP(p);
+ /*
+ * Only when exactly two section headers follow the one at
+ * e_shstrndx are they read and their contents copied behind the
+ * image, each preceded by its size.
+ * NOTE(review): presumably .symtab and .strtab -- confirm.
+ */
+ if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
+ off = hdr.eh.e_shoff + sizeof(es[0]) *
+ (hdr.eh.e_shstrndx + 1);
+ if (xfsread(&dn, &off, &es, sizeof(es)))
+ return;
+ for (i = 0; i < 2; i++) {
+ memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
+ p += sizeof(es[i].sh_size);
+ off = es[i].sh_offset;
+ if (xfsread(&dn, &off, p, es[i].sh_size))
+ return;
+ p += es[i].sh_size;
+ }
+ }
+ addr = hdr.eh.e_entry & 0xffffff;
+ }
+ bootinfo.bi_esymtab = VTOP(p);
+ bootinfo.bi_kernelname = VTOP(kname);
+ /* The 64-bit pool GUID is passed as two 32-bit halves, low first. */
+ __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
+ bootdev,
+ KARGS_FLAGS_ZFS,
+ (uint32_t) spa->spa_guid,
+ (uint32_t) (spa->spa_guid >> 32),
+ VTOP(&bootinfo));
+}
+
+/*
+ * Parse the command line in cmd.  The line is split in place into
+ * whitespace-separated words, each handled according to its first
+ * character:
+ *
+ *   -flags      toggle boot options; -P selects the console from the
+ *               keyboard-present bit, -S<speed> sets the serial speed
+ *   ?path       list the directory 'path' and stop
+ *   status      print the status of all pools and stop
+ *   [pool:]path select the pool (optional) and the file to load
+ *
+ * Returns 0 if the whole line was accepted and -1 otherwise; the
+ * listing and status commands also return -1.
+ */
+static int
+parse(void)
+{
+    char *arg = cmd;
+    char *ep, *p, *q;
+    const char *cp;
+    int c, i, j;
+
+    while ((c = *arg++)) {
+        if (c == ' ' || c == '\t' || c == '\n')
+            continue;
+        /* Terminate the current word; ep marks its end, p the next word. */
+        for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++)
+            ;
+        ep = p;
+        if (*p)
+            *p++ = 0;
+        if (c == '-') {
+            while ((c = *arg++)) {
+                if (c == 'P') {
+                    if (*(uint8_t *)PTOV(0x496) & 0x10) {
+                        cp = "yes";
+                    } else {
+                        opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
+                        cp = "no";
+                    }
+                    printf("Keyboard: %s\n", cp);
+                    continue;
+                } else if (c == 'S') {
+                    j = 0;
+                    while ((unsigned int)(i = *arg++ - '0') <= 9)
+                        j = j * 10 + i;
+                    /* Accept only a positive number that ends the word. */
+                    if (j > 0 && i == -'0') {
+                        comspeed = j;
+                        break;
+                    }
+                    /* Fall through to error below ('S' not in optstr[]). */
+                }
+                for (i = 0; c != optstr[i]; i++)
+                    if (i == NOPT - 1)
+                        return -1;
+                opts ^= OPT_SET(flags[i]);
+            }
+            ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
+                     OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
+            if (ioctrl & IO_SERIAL)
+                sio_init(115200 / comspeed);
+        } else if (c == '?') {
+            dnode_phys_t dn;
+
+            if (zfs_lookup(spa, arg, &dn) == 0) {
+                zap_list(spa, &dn);
+            }
+            return -1;
+        } else {
+            /* Step back so that arg includes the word's first character. */
+            arg--;
+
+            /*
+             * Report pool status if the command is 'status'. Lets
+             * hope no-one wants to load /status as a kernel.
+             */
+            if (!strcmp(arg, "status")) {
+                spa_all_status();
+                return -1;
+            }
+
+            /*
+             * If there is a colon, switch pools.
+             */
+            q = (char *) strchr(arg, ':');
+            if (q) {
+                spa_t *newspa;
+
+                *q++ = 0;
+                newspa = spa_find_by_name(arg);
+                if (newspa) {
+                    spa = newspa;
+                    zfs_mount_pool(spa);
+                } else {
+                    printf("\nCan't find ZFS pool %s\n", arg);
+                    return -1;
+                }
+                arg = q;
+            }
+            /* A non-empty path replaces kname, terminator included. */
+            if ((i = ep - arg)) {
+                if ((size_t)i >= sizeof(kname))
+                    return -1;
+                memcpy(kname, arg, i + 1);
+            }
+        }
+        arg = p;
+    }
+    return 0;
+}
+
+/*
+ * Minimal console printf.  Supported conversions:
+ *
+ *   %c   character
+ *   %s   string, optionally with a decimal field width and a '-' flag
+ *   %u   unsigned decimal
+ *
+ * Any other character following '%' is printed as is.  Output goes
+ * through putchar().
+ *
+ * NOTE(review): for %s with a width, the padding is written before the
+ * string when '-' is given and after it otherwise, which is the reverse
+ * of standard printf -- confirm that callers rely on this.
+ */
+static void
+printf(const char *fmt,...)
+{
+ va_list ap;
+ /* Holds the digits of a %u value, least significant first. */
+ /* NOTE(review): 10 digits assumes a 32-bit unsigned -- confirm. */
+ char buf[10];
+ char *s;
+ unsigned u;
+ int c;
+ int minus;
+ int prec;
+ int len;
+ int pad;
+
+ va_start(ap, fmt);
+ while ((c = *fmt++)) {
+ if (c == '%') {
+ minus = 0;
+ prec = 0;
+ /* Flags and width digits loop back here until a conversion is seen. */
+ nextfmt:
+ c = *fmt++;
+ switch (c) {
+ case '-':
+ minus = 1;
+ goto nextfmt;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ prec = 10 * prec + (c - '0');
+ goto nextfmt;
+ case 'c':
+ putchar(va_arg(ap, int));
+ continue;
+ case 's':
+ s = va_arg(ap, char *);
+ if (prec) {
+ /* prec is used as a minimum field width; nothing is truncated. */
+ len = strlen(s);
+ if (len < prec)
+ pad = prec - len;
+ else
+ pad = 0;
+ if (minus)
+ while (pad--)
+ putchar(' ');
+ for (; *s; s++)
+ putchar(*s);
+ if (!minus)
+ while (pad--)
+ putchar(' ');
+ } else {
+ for (; *s; s++)
+ putchar(*s);
+ }
+ continue;
+ case 'u':
+ u = va_arg(ap, unsigned);
+ s = buf;
+ /* Generate the digits in reverse, then print them back to front. */
+ do
+ *s++ = '0' + u % 10U;
+ while (u /= 10U);
+ while (--s >= buf)
+ putchar(*s);
+ continue;
+ }
+ }
+ /* Ordinary character, or an unrecognised conversion character. */
+ putchar(c);
+ }
+ va_end(ap);
+ return;
+}
+
+/*
+ * Write one character to the console, emitting a carriage return in
+ * front of every newline.
+ */
+static void
+putchar(int c)
+{
+    switch (c) {
+    case '\n':
+        xputc('\r');
+        /* FALLTHROUGH */
+    default:
+        xputc(c);
+    }
+}
+
+/*
+ * Read nblk sectors starting at sector lba, relative to dsk->start,
+ * from the drive dsk->drive into buf by calling the xread routine of
+ * boot1 through v86int().  Unless RBX_QUIET is set a progress spinner
+ * is advanced on every call.  Returns 0 on success; on failure an
+ * error message is printed and -1 is returned.
+ */
+static int
+drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk)
+{
+ /* The four spinner characters '-', '\', '|' and '/', one per byte. */
+ static unsigned c = 0x2d5c7c2f;
+
+ lba += dsk->start;
+ /* Rotate by one byte and print the low byte, then back up over it. */
+ if (!OPT_CHECK(RBX_QUIET))
+ printf("%c\b", c = c << 8 | c >> 24);
+ v86.ctl = V86_ADDR | V86_CALLF | V86_FLAGS;
+ v86.addr = XREADORG; /* call to xread in boot1 */
+ /* The buffer goes in %es:%bx and the LBA in %cx:%ax (high:low). */
+ v86.es = VTOPSEG(buf);
+ v86.eax = lba;
+ v86.ebx = VTOPOFF(buf);
+ v86.ecx = lba >> 16;
+ /* Sector count in %dh, drive number in %dl. */
+ v86.edx = nblk << 8 | dsk->drive;
+ v86int();
+ v86.ctl = V86_FLAGS;
+ if (V86_CY(v86.efl)) {
+ /* The error code printed is the %ah byte left by the call. */
+ printf("error %u lba %u\n", v86.eax >> 8 & 0xff, lba);
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Wait up to 'ticks' timer ticks for a key press.  Returns 1 as soon as
+ * input is available and 0 on timeout, when the tick counter read from
+ * address 0x46c goes backwards, or immediately when RBX_NOINTR is set.
+ */
+static int
+keyhit(unsigned ticks)
+{
+    uint32_t start, now;
+
+    if (OPT_CHECK(RBX_NOINTR))
+        return 0;
+    start = 0;
+    while (!xgetc(1)) {
+        now = *(uint32_t *)PTOV(0x46c);
+        /* Latch the first non-zero reading as the starting time. */
+        if (start == 0)
+            start = now;
+        if (now < start || now >= start + ticks)
+            return 0;
+    }
+    return 1;
+}
+
+/*
+ * Send the character c to every console selected in ioctrl: the screen
+ * for IO_KEYBOARD, the serial port for IO_SERIAL.  Returns c.
+ */
+static int
+xputc(int c)
+{
+    if ((ioctrl & IO_KEYBOARD) != 0)
+        putc(c);
+    if ((ioctrl & IO_SERIAL) != 0)
+        sio_putc(c);
+    return c;
+}
+
+/*
+ * Console input from whichever of keyboard and serial port is enabled
+ * in ioctrl.  With fn non-zero, poll once and return 1 if a character
+ * is waiting, 0 if not.  With fn zero, wait for a character and return
+ * it.  Always returns 0 when RBX_NOINTR is set.
+ */
+static int
+xgetc(int fn)
+{
+    if (OPT_CHECK(RBX_NOINTR))
+        return 0;
+    do {
+        if ((ioctrl & IO_KEYBOARD) && getc(1))
+            return fn ? 1 : getc(0);
+        if ((ioctrl & IO_SERIAL) && sio_ischar())
+            return fn ? 1 : sio_getc();
+    } while (!fn);
+    return 0;
+}
+
+/*
+ * Keyboard access through BIOS interrupt 0x16, with fn placed in %ah.
+ * For fn == 0 the low byte of the returned %eax is the result.  For
+ * any other fn the result is 1 if the zero flag is clear and that low
+ * byte is non-zero, 0 otherwise.
+ */
+static int
+getc(int fn)
+{
+    v86.addr = 0x16;
+    v86.eax = fn << 8;
+    v86int();
+
+    if (fn == 0)
+        return v86.eax & 0xff;
+
+    /*
+     * The extra comparison against zero is an attempt to work around
+     * what appears to be a bug in QEMU and Bochs. Both emulators
+     * sometimes report a key-press with scancode one and ascii zero
+     * when no such key is pressed in reality. As far as I can tell,
+     * this only happens shortly after a reboot.
+     */
+    return !V86_ZR(v86.efl) && (v86.eax & 0xff);
+}
diff --git a/sys/boot/i386/zfsboot/zfsldr.S b/sys/boot/i386/zfsboot/zfsldr.S
new file mode 100644
index 000000000000..a256d30276df
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsldr.S
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ *
+ * $FreeBSD$
+ */
+
+/* Memory Locations */
+ .set MEM_REL,0x700 # Relocation address
+ .set MEM_ARG,0x900 # Arguments
+ .set MEM_ORG,0x7c00 # Origin
+ .set MEM_BUF,0x8000 # Load area
+ .set MEM_BTX,0x9000 # BTX start
+ .set MEM_JMP,0x9010 # BTX entry point
+ .set MEM_USR,0xa000 # Client start
+ .set BDA_BOOT,0x472 # Boot howto flag
+
+/* Partition Constants */
+ .set PRT_OFF,0x1be # Partition offset
+ .set PRT_NUM,0x4 # Partitions
+ .set PRT_BSD,0xa5 # Partition type
+
+/* Flag Bits */
+ .set FL_PACKET,0x80 # Packet mode
+
+/* Misc. Constants */
+ .set SIZ_PAG,0x1000 # Page size
+ .set SIZ_SEC,0x200 # Sector size
+
+ .set NSECT,0x40
+ .globl start
+ .globl xread
+ .code16
+
+start: jmp main # Start recognizably
+
+/*
+ * This is the start of a standard BIOS Parameter Block (BPB). Most bootable
+ * FAT disks have this at the start of their MBR. While normal BIOS's will
+ * work fine without this section, IBM's El Torito emulation "fixes" up the
+ * BPB by writing into the memory copy of the MBR. Rather than have data
+ * written into our xread routine, we'll define a BPB to work around it.
+ * The data marked with (T) indicates a field required for a ThinkPad to
+ * recognize the disk and (W) indicates fields written from IBM BIOS code.
+ * The use of the BPB is based on what OpenBSD and NetBSD implemented in
+ * their boot code but the required fields were determined by trial and error.
+ *
+ * Note: If additional space is needed in boot1, one solution would be to
+ * move the "prompt" message data (below) to replace the OEM ID.
+ */
+ .org 0x03, 0x00
+oemid: .space 0x08, 0x00 # OEM ID
+
+ .org 0x0b, 0x00
+bpb: .word 512 # sector size (T)
+ .byte 0 # sectors/cluster
+ .word 0 # reserved sectors
+ .byte 0 # number of FATs
+ .word 0 # root entries
+ .word 0 # small sectors
+ .byte 0 # media type (W)
+ .word 0 # sectors/fat
+ .word 18 # sectors per track (T)
+ .word 2 # number of heads (T)
+ .long 0 # hidden sectors (W)
+ .long 0 # large sectors
+
+ .org 0x24, 0x00
+ebpb: .byte 0 # BIOS physical drive number (W)
+
+ .org 0x25,0x90
+/*
+ * Trampoline used by boot2 to call read to read data from the disk via
+ * the BIOS. Call with:
+ *
+ * %cx:%ax - long - LBA to read in
+ * %es:(%bx) - caddr_t - buffer to read data into
+ * %dl - byte - drive to read from
+ * %dh - byte - num sectors to read
+ *
+ * This is a far routine: it returns with lret. The flags left by read
+ * are passed back unchanged; nread tests the carry flag after the call.
+ */
+
+xread: push %ss # Address
+ pop %ds # data
+/*
+ * Setup an EDD disk packet and pass it to read. The packet is built on
+ * the stack from its last field to its first: a zero upper LBA dword,
+ * the LBA in %cx:%ax, the buffer in %es:%bx, the block count from %dh
+ * and the packet size of 0x10. nread enters here after pushing %cs
+ * itself so that the lret returns to it.
+ */
+xread.1: # Starting
+ pushl $0x0 # absolute
+ push %cx # block
+ push %ax # number
+ push %es # Address of
+ push %bx # transfer buffer
+ xor %ax,%ax # Number of
+ movb %dh,%al # blocks to
+ push %ax # transfer
+ push $0x10 # Size of packet
+ mov %sp,%bp # Packet pointer
+ callw read # Read from disk
+ lea 0x10(%bp),%sp # Clear stack
+ lret # To far caller
+/*
+ * Load the rest of boot2 and BTX up, copy the parts to the right locations,
+ * and start it all up.
+ */
+
+/*
+ * Setup the segment registers to flat addressing (segment 0) and setup the
+ * stack to end just below the start of our code.
+ */
+main: cld # String ops inc
+ xor %cx,%cx # Zero
+ mov %cx,%es # Address
+ mov %cx,%ds # data
+ mov %cx,%ss # Set up
+ mov $start,%sp # stack
+/*
+ * Relocate ourself to MEM_REL. Since %cx == 0, the inc %ch sets
+ * %cx == 0x100, i.e. 0x100 words (one 512-byte sector) are copied.
+ */
+ mov %sp,%si # Source
+ mov $MEM_REL,%di # Destination
+ incb %ch # Word count
+ rep # Copy
+ movsw # code
+/*
+ * If we are on a hard drive, then load the MBR and look for the first
+ * FreeBSD slice. We use the fake partition entry below that points to
+ * the MBR when we call nread. The first pass looks for the first active
+ * FreeBSD slice. The second pass looks for the first non-active FreeBSD
+ * slice if the first one fails.
+ *
+ * %cx is 1 during the first pass and 0 during the second; %dh counts
+ * the partition entries starting from 1.
+ */
+ mov $part4,%si # Partition
+ cmpb $0x80,%dl # Hard drive?
+ jb main.4 # No
+ movb $0x1,%dh # Block count
+ callw nread # Read MBR
+ mov $0x1,%cx # Two passes
+main.1: mov $MEM_BUF+PRT_OFF,%si # Partition table
+ movb $0x1,%dh # Partition
+main.2: cmpb $PRT_BSD,0x4(%si) # Our partition type?
+ jne main.3 # No
+ jcxz main.5 # If second pass
+ testb $0x80,(%si) # Active?
+ jnz main.5 # Yes
+main.3: add $0x10,%si # Next entry
+ incb %dh # Partition
+ cmpb $0x1+PRT_NUM,%dh # In table?
+ jb main.2 # Yes
+ dec %cx # Do two
+ jcxz main.1 # passes
+/*
+ * If we get here, we didn't find any FreeBSD slices at all, so print an
+ * error message and die.
+ */
+ mov $msg_part,%si # Message
+ jmp error # Error
+/*
+ * Floppies use partition 0 of drive 0.
+ */
+main.4: xor %dx,%dx # Partition:drive
+
+/*
+ * Ok, we have a slice and drive in %dx now, so use that to locate and
+ * load boot2. %si references the start of the slice we are looking
+ * for, so go ahead and load up the 64 sectors starting at sector 1024
+ * (i.e. after the two vdev labels). We don't have to do anything fancy
+ * here to allow for an extra copy of boot1 and a partition table
+ * (compare to this section of the UFS bootstrap) so we just load it
+ * all at 0x8000. The first part of boot2 is BTX, which wants to run
+ * at 0x9000. The boot2.bin binary starts right after the end of BTX,
+ * so we have to figure out where the start of it is and then move the
+ * binary to 0xc000. After we have moved the client, we relocate BTX
+ * itself to 0x9000 - doing it in this order means that none of the
+ * memcpy regions overlap which would corrupt the copy. Normally, BTX
+ * clients start at MEM_USR, or 0xa000, but when we use btxld to
+ * create boot2, we use an entry point of 0x2000. That entry point is
+ * relative to MEM_USR; thus boot2.bin starts at 0xc000.
+ *
+ * The load area and the target area for the client overlap so we have
+ * to use a decrementing string move. We also play segment register
+ * games with the destination address for the move so that the client
+ * can be larger than 16k (which would overflow the zero segment since
+ * the client starts at 0xc000). Relocating BTX is easy since the load
+ * area and target area do not overlap.
+ */
+main.5: mov %dx,MEM_ARG # Save args
+ movb $NSECT,%dh # Sector count
+ movw $1024,%ax # Offset to boot2
+ callw nread.1 # Read disk
+main.6: mov $MEM_BUF,%si # BTX (before reloc)
+ mov 0xa(%si),%bx # Get BTX length and set
+ mov $NSECT*SIZ_SEC-1,%di # Size of load area (less one)
+ mov %di,%si # End of load
+ add $MEM_BUF,%si # area
+ sub %bx,%di # End of client, 0xc000 rel
+ mov %di,%cx # Size of
+ inc %cx # client
+ mov $(MEM_USR+2*SIZ_PAG)>>4,%dx # Segment
+ mov %dx,%es # addressing 0xc000
+ std # Move with decrement
+ rep # Relocate
+ movsb # client
+ mov %ds,%dx # Back to
+ mov %dx,%es # zero segment
+ mov $MEM_BUF,%si # BTX (before reloc)
+ mov $MEM_BTX,%di # BTX
+ mov %bx,%cx # Get BTX length
+ cld # Increment this time
+ rep # Relocate
+ movsb # BTX
+
+
+/*
+ * Enable A20 so we can access memory above 1 meg.
+ * Use the zero-valued %cx as a timeout for embedded hardware which does
+ * not have a keyboard controller: only the first wait loop is bounded
+ * by %cx, and on timeout the A20 command sequence is skipped entirely.
+ */
+seta20: cli # Disable interrupts
+seta20.1: dec %cx # Timeout?
+ jz seta20.3 # Yes
+ inb $0x64,%al # Get status
+ testb $0x2,%al # Busy?
+ jnz seta20.1 # Yes
+ movb $0xd1,%al # Command: Write
+ outb %al,$0x64 # output port
+seta20.2: inb $0x64,%al # Get status
+ testb $0x2,%al # Busy?
+ jnz seta20.2 # Yes
+ movb $0xdf,%al # Enable
+ outb %al,$0x60 # A20
+seta20.3: sti # Enable interrupts
+
+ jmp start+MEM_JMP-MEM_ORG # Start BTX
+
+
+
+/*
+ * Trampoline used to call read from within boot1.
+ *
+ * %si points at a partition table entry; the 32-bit start LBA is taken
+ * from offset 0x8 of that entry. nread reads from the start of the
+ * partition, nread.1 from the sector offset given in %ax. %dl is the
+ * drive and %dh the sector count; the data is read to MEM_BUF. On
+ * failure control falls through to the error routine and never
+ * returns.
+ */
+nread: xor %ax,%ax # Sector offset in partition
+nread.1: mov $MEM_BUF,%bx # Transfer buffer
+ add 0x8(%si),%ax # Get LBA: offset plus low word,
+ mov 0xa(%si),%cx # high word (flags untouched),
+ adc $0x0,%cx # plus carry from the low word
+ push %cs # Read from
+ callw xread.1 # disk
+ jnc return # If success, return
+ mov $msg_read,%si # Otherwise, set the error
+ # message and fall through to
+ # the error routine
+/*
+ * Print out the error message pointed to by %ds:(%si) followed
+ * by a prompt, wait for a keypress, and then reboot the machine.
+ * The reboot is requested as a warm boot by storing 0x1234 at BDA_BOOT
+ * before jumping to 0xffff:0. This routine does not return.
+ */
+error: callw putstr # Display message
+ mov $prompt,%si # Display
+ callw putstr # prompt
+ xorb %ah,%ah # BIOS: Get
+ int $0x16 # keypress
+ movw $0x1234, BDA_BOOT # Do a warm boot
+ ljmp $0xffff,$0x0 # reboot the machine
+/*
+ * Display a null-terminated string using the BIOS output.
+ * The entry point is putstr, with the string at %ds:(%si); putstr.0 is
+ * the loop body that prints the character just fetched into %al.
+ */
+putstr.0: mov $0x7,%bx # Page:attribute
+ movb $0xe,%ah # BIOS: Display
+ int $0x10 # character
+putstr: lodsb # Get char
+ testb %al,%al # End of string?
+ jne putstr.0 # No
+
+/*
+ * Overused return code. ereturn is used to return an error from the
+ * read function. Since we assume putstr succeeds, we (ab)use the
+ * same code when we return from putstr. As a consequence putstr
+ * returns with %ah set to 1 and the carry flag set.
+ */
+ereturn: movb $0x1,%ah # Invalid
+ stc # argument
+return: retw # To caller
+/*
+ * Reads sectors from the disk. If EDD is enabled, then check if it is
+ * installed and use it if it is. If it is not installed or not enabled, then
+ * fall back to using CHS. Since we use a LBA, if we are using CHS, we have to
+ * fetch the drive parameters from the BIOS and divide it out ourselves.
+ * Call with:
+ *
+ * %dl - byte - drive number
+ * stack - 0x10 bytes - EDD Packet, addressed by %bp
+ *
+ * The result of the BIOS extended read is returned to the caller as is.
+ * In this file the CHS fallback below is compiled out (#if 0): read.1
+ * instead reports "CHS not supported" through the error routine.
+ */
+read: testb $FL_PACKET,%cs:MEM_REL+flags-start # LBA support enabled?
+ jz read.1 # No, use CHS
+ cmpb $0x80,%dl # Hard drive?
+ jb read.1 # No, use CHS
+ mov $0x55aa,%bx # Magic
+ push %dx # Save
+ movb $0x41,%ah # BIOS: Check
+ int $0x13 # extensions present
+ pop %dx # Restore
+ jc read.1 # If error, use CHS
+ cmp $0xaa55,%bx # Magic?
+ jne read.1 # No, so use CHS
+ testb $0x1,%cl # Packet interface?
+ jz read.1 # No, so use CHS
+ mov %bp,%si # Disk packet
+ movb $0x42,%ah # BIOS: Extended
+ int $0x13 # read
+ retw # To caller
+#if 0
+read.1: push %dx # Save
+ movb $0x8,%ah # BIOS: Get drive
+ int $0x13 # parameters
+ movb %dh,%ch # Max head number
+ pop %dx # Restore
+ jc return # If error
+ andb $0x3f,%cl # Sectors per track
+ jz ereturn # If zero
+ cli # Disable interrupts
+ mov 0x8(%bp),%eax # Get LBA
+ push %dx # Save
+ movzbl %cl,%ebx # Divide by
+ xor %edx,%edx # sectors
+ div %ebx # per track
+ movb %ch,%bl # Max head number
+ movb %dl,%ch # Sector number
+ inc %bx # Divide by
+ xorb %dl,%dl # number
+ div %ebx # of heads
+ movb %dl,%bh # Head number
+ pop %dx # Restore
+ cmpl $0x3ff,%eax # Cylinder number supportable?
+ sti # Enable interrupts
+ ja ereturn # No, return an error
+ xchgb %al,%ah # Set up cylinder
+ rorb $0x2,%al # number
+ orb %ch,%al # Merge
+ inc %ax # sector
+ xchg %ax,%cx # number
+ movb %bh,%dh # Head number
+ subb %ah,%al # Sectors this track
+ mov 0x2(%bp),%ah # Blocks to read
+ cmpb %ah,%al # To read
+ jb read.2 # this
+#ifdef TRACK_AT_A_TIME
+ movb %ah,%al # track
+#else
+ movb $1,%al # one sector
+#endif
+read.2: mov $0x5,%di # Try count
+read.3: les 0x4(%bp),%bx # Transfer buffer
+ push %ax # Save
+ movb $0x2,%ah # BIOS: Read
+ int $0x13 # from disk
+ pop %bx # Restore
+ jnc read.4 # If success
+ dec %di # Retry?
+ jz read.6 # No
+ xorb %ah,%ah # BIOS: Reset
+ int $0x13 # disk system
+ xchg %bx,%ax # Block count
+ jmp read.3 # Continue
+read.4: movzbw %bl,%ax # Sectors read
+ add %ax,0x8(%bp) # Adjust
+ jnc read.5 # LBA,
+ incw 0xa(%bp) # transfer
+read.5: shlb %bl # buffer
+ add %bl,0x5(%bp) # pointer,
+ sub %al,0x2(%bp) # block count
+ ja read.1 # If not done
+read.6: retw # To caller
+#else
+read.1: mov $msg_chs,%si
+ jmp error
+msg_chs: .asciz "CHS not supported"
+#endif
+
+/* Messages */
+
+msg_read: .asciz "Read"
+msg_part: .asciz "Boot"
+
+prompt: .asciz " error\r\n"
+
+flags: .byte FLAGS # Flags
+
+ .org PRT_OFF,0x90
+
+/* Partition table */
+
+ .fill 0x30,0x1,0x0
+part4: .byte 0x80, 0x00, 0x01, 0x00
+ .byte 0xa5, 0xfe, 0xff, 0xff
+ .byte 0x00, 0x00, 0x00, 0x00
+ .byte 0x50, 0xc3, 0x00, 0x00 # 50000 sectors long, bleh
+
+ .word 0xaa55 # Magic number
diff --git a/sys/boot/zfs/Makefile b/sys/boot/zfs/Makefile
new file mode 100644
index 000000000000..723233ce4ebb
--- /dev/null
+++ b/sys/boot/zfs/Makefile
@@ -0,0 +1,29 @@
+# $FreeBSD$
+
+# Builds the zfsboot library from zfs.c.
+# NOTE(review): INTERNALLIB presumably marks the library as build-only
+# (not installed) -- confirm against bsd.lib.mk.
+LIB= zfsboot
+INTERNALLIB=
+
+SRCS+= zfs.c
+
+# Header search paths: the common boot code, the top of the sys tree,
+# the object directory (for the "machine" link below), libstand and the
+# CDDL-licensed ZFS boot sources.
+CFLAGS+= -I${.CURDIR}/../common -I${.CURDIR}/../.. -I.
+CFLAGS+= -I${.CURDIR}/../../../lib/libstand
+CFLAGS+= -I${.CURDIR}/../../cddl/boot/zfs
+
+# XXX need arch-specific bootstrap CFLAGS here
+#
+CFLAGS+= -ffreestanding -mpreferred-stack-boundary=2 \
+ -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3
+
+CFLAGS+= -Wformat -Wall
+
+# On amd64, point "machine" at the i386 headers.
+# NOTE(review): presumably because the boot code is built as 32-bit
+# code there -- confirm.
+.if ${MACHINE_ARCH} == "amd64"
+CLEANFILES+= machine
+machine:
+ ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.lib.mk>
+
+# The link must exist before dependencies are generated or any object
+# is compiled.
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend ${OBJS}: machine
+.endif
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
new file mode 100644
index 000000000000..cf0bb9c7f656
--- /dev/null
+++ b/sys/boot/zfs/zfs.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Stand-alone file reading package.
+ */
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stand.h>
+#include <bootstrap.h>
+
+#include "zfsimpl.c"
+
+/*
+ * File operations implemented in this file; they are exported to the
+ * rest of the loader only through zfs_fsops below.
+ */
+static int zfs_open(const char *path, struct open_file *f);
+static int zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid);
+static int zfs_close(struct open_file *f);
+static int zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
+static off_t zfs_seek(struct open_file *f, off_t offset, int where);
+static int zfs_stat(struct open_file *f, struct stat *sb);
+static int zfs_readdir(struct open_file *f, struct dirent *d);
+
+/* Initialised at the end of this file; zfs_open() compares f->f_dev to it. */
+struct devsw zfs_dev;
+
+/*
+ * File system switch entry for ZFS.  The initializer is positional, so
+ * the order of the entries has to match the member order of struct
+ * fs_ops, which is declared outside this file.
+ */
+struct fs_ops zfs_fsops = {
+ "zfs",
+ zfs_open,
+ zfs_close,
+ zfs_read,
+ zfs_write,
+ zfs_seek,
+ zfs_stat,
+ zfs_readdir
+};
+
+/*
+ * In-core open file.
+ *
+ * One of these is allocated and zeroed by zfs_open() and hung off
+ * open_file.f_fsdata.  The last three members are only used by
+ * zfs_readdir().
+ */
+struct file {
+ off_t f_seekp; /* seek pointer */
+ dnode_phys_t f_dnode; /* dnode filled in by zfs_lookup() at open */
+ uint64_t f_zap_type; /* zap type for readdir */
+ uint64_t f_num_leafs; /* number of fzap leaf blocks */
+ zap_leaf_phys_t *f_zap_leaf; /* zap leaf buffer */
+};
+
+/*
+ * Open a file.
+ *
+ * Mounts the pool stored in f->f_devdata, allocates the per-file state
+ * and resolves upath to a dnode with zfs_lookup().
+ *
+ * Returns 0 on success.  Fails with EINVAL when f is not backed by
+ * zfs_dev, ENOMEM when the per-file state cannot be allocated, EIO when
+ * the root object set is not a ZFS file system, or whatever
+ * zfs_mount_pool()/zfs_lookup() report.  On failure f->f_fsdata is left
+ * NULL and nothing stays allocated.
+ */
+static int
+zfs_open(const char *upath, struct open_file *f)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp;
+	int rc;
+
+	if (f->f_dev != &zfs_dev)
+		return (EINVAL);
+
+	rc = zfs_mount_pool(spa);
+	if (rc)
+		return (rc);
+
+	/* allocate file system specific data structure */
+	fp = malloc(sizeof(*fp));
+	if (fp == NULL)
+		return (ENOMEM);
+	bzero(fp, sizeof(*fp));
+	f->f_fsdata = (void *)fp;
+
+	if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+		printf("Unexpected object set type %lld\n",
+		    spa->spa_root_objset.os_type);
+		rc = EIO;
+		goto out;
+	}
+
+	rc = zfs_lookup(spa, upath, &fp->f_dnode);
+	if (rc)
+		goto out;
+
+	fp->f_seekp = 0;
+out:
+	if (rc) {
+		f->f_fsdata = NULL;
+		free(fp);
+	}
+	return (rc);
+}
+
+/*
+ * Close a file: invalidate the dnode block cache, detach the per-file
+ * state from f and release it, including the ZAP leaf buffer that
+ * zfs_readdir() may have allocated.  Always returns 0.
+ */
+static int
+zfs_close(struct open_file *f)
+{
+	struct file *fp = (struct file *)f->f_fsdata;
+
+	/* The cache is keyed on the address of fp->f_dnode, which dies here. */
+	dnode_cache_obj = 0;
+	f->f_fsdata = NULL;
+	if (fp == NULL)
+		return (0);
+
+	/* zfs_open() zeroes the structure, so NULL means "never allocated". */
+	if (fp->f_zap_leaf != NULL)
+		free(fp->f_zap_leaf);
+	free(fp);
+	return (0);
+}
+
+/*
+ * Copy a portion of a file into kernel memory.
+ * Cross block boundaries when necessary.
+ *
+ * Reads up to size bytes from the current seek position into start and
+ * advances the seek pointer by the amount read.  The request is clipped
+ * at end of file; a read at or beyond EOF transfers nothing.  If resid
+ * is non-NULL it receives the number of requested bytes that were not
+ * read.  Returns 0 on success, EINVAL for a negative seek position, or
+ * the error reported by dnode_read().
+ */
+static int
+zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp = (struct file *)f->f_fsdata;
+	const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus;
+	uint64_t pos, avail;
+	size_t n;
+	int rc;
+
+	if (fp->f_seekp < 0)
+		return (EINVAL);
+	pos = (uint64_t)fp->f_seekp;
+
+	/*
+	 * Work out how much of the file is left before subtracting, so a
+	 * seek pointer beyond EOF cannot wrap into a huge length.
+	 */
+	avail = (pos < zp->zp_size) ? zp->zp_size - pos : 0;
+	n = size;
+	if (n > avail)
+		n = avail;
+
+	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
+	if (rc)
+		return (rc);
+
+	fp->f_seekp += n;
+	if (resid)
+		*resid = size - n;
+
+	return (0);
+}
+
+/*
+ * Writing is not supported: the boot code only ever reads from a pool,
+ * so every write request is rejected with EROFS.
+ */
+static int
+zfs_write(struct open_file *f, void *start, size_t size, size_t *resid /* out */)
+{
+	(void)f;
+	(void)start;
+	(void)size;
+	(void)resid;
+
+	return (EROFS);
+}
+
+/*
+ * Reposition the seek pointer of an open file and return the new
+ * position.  SEEK_SET and SEEK_CUR behave as usual; SEEK_END places the
+ * pointer at (file size - offset).  An unknown 'where' leaves the
+ * position untouched, sets errno to EINVAL and returns -1.
+ */
+static off_t
+zfs_seek(struct open_file *f, off_t offset, int where)
+{
+	struct file *fp = (struct file *)f->f_fsdata;
+	znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+	off_t target;
+
+	if (where == SEEK_SET) {
+		target = offset;
+	} else if (where == SEEK_CUR) {
+		target = fp->f_seekp + offset;
+	} else if (where == SEEK_END) {
+		target = zp->zp_size - offset;
+	} else {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	fp->f_seekp = target;
+	return (target);
+}
+
+/*
+ * Fill in the stat fields the loader cares about (mode, owner, group
+ * and size) from the znode stored in the dnode's bonus area.  The other
+ * members of *sb are not touched.  Always returns 0.
+ */
+static int
+zfs_stat(struct open_file *f, struct stat *sb)
+{
+	struct file *state = (struct file *)f->f_fsdata;
+	znode_phys_t *znode = (znode_phys_t *) state->f_dnode.dn_bonus;
+
+	/* only important stuff */
+	sb->st_size = znode->zp_size;
+	sb->st_gid = znode->zp_gid;
+	sb->st_uid = znode->zp_uid;
+	sb->st_mode = znode->zp_mode;
+
+	return (0);
+}
+
+/*
+ * Return the next entry of an open directory in *d.
+ *
+ * The seek pointer doubles as the iteration cursor.  For a micro ZAP it
+ * is the byte offset of the next mzap entry; for a fat ZAP the high
+ * bits select the leaf block and the low bits are a chunk index within
+ * it.  A seek pointer of 0 means "not started yet".
+ *
+ * Returns 0 with *d filled in, ENOENT at the end of the directory,
+ * ENOTDIR if the file is not a directory, ENOMEM if the leaf buffer
+ * cannot be allocated, or an error from dnode_read().
+ */
+static int
+zfs_readdir(struct open_file *f, struct dirent *d)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp = (struct file *)f->f_fsdata;
+	znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+	mzap_ent_phys_t mze;
+	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	int rc;
+
+	if ((zp->zp_mode >> 12) != 0x4) {
+		return (ENOTDIR);
+	}
+
+	/*
+	 * If this is the first read, get the zap type.
+	 */
+	if (fp->f_seekp == 0) {
+		rc = dnode_read(spa, &fp->f_dnode,
+		    0, &fp->f_zap_type, sizeof(fp->f_zap_type));
+		if (rc)
+			return (rc);
+
+		if (fp->f_zap_type == ZBT_MICRO) {
+			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
+		} else {
+			rc = dnode_read(spa, &fp->f_dnode,
+			    offsetof(zap_phys_t, zap_num_leafs),
+			    &fp->f_num_leafs,
+			    sizeof(fp->f_num_leafs));
+			if (rc)
+				return (rc);
+
+			/*
+			 * Reuse the leaf buffer if the directory is being
+			 * re-read after a rewind instead of leaking it.
+			 */
+			if (fp->f_zap_leaf == NULL) {
+				fp->f_zap_leaf =
+				    (zap_leaf_phys_t *)malloc(bsize);
+				if (fp->f_zap_leaf == NULL)
+					return (ENOMEM);
+			}
+
+			/*
+			 * Only move the cursor once the first leaf has
+			 * actually been loaded, so a failed read can be
+			 * retried from scratch.
+			 */
+			rc = dnode_read(spa, &fp->f_dnode,
+			    bsize,
+			    fp->f_zap_leaf,
+			    bsize);
+			if (rc)
+				return (rc);
+			fp->f_seekp = bsize;
+		}
+	}
+
+	if (fp->f_zap_type == ZBT_MICRO) {
+	mzap_next:
+		if (fp->f_seekp >= bsize)
+			return (ENOENT);
+
+		rc = dnode_read(spa, &fp->f_dnode,
+		    fp->f_seekp, &mze, sizeof(mze));
+		if (rc)
+			return (rc);
+		fp->f_seekp += sizeof(mze);
+
+		/* Unused slots have an empty name. */
+		if (!mze.mze_name[0])
+			goto mzap_next;
+
+		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
+		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
+		strcpy(d->d_name, mze.mze_name);
+		d->d_namlen = strlen(d->d_name);
+		return (0);
+	} else {
+		zap_leaf_t zl;
+		zap_leaf_chunk_t *zc, *nc;
+		int chunk;
+		size_t namelen;
+		char *p;
+		uint64_t value;
+
+		/*
+		 * Initialise this so we can use the ZAP size
+		 * calculating macros.
+		 */
+		zl.l_bs = ilog2(bsize);
+		zl.l_phys = fp->f_zap_leaf;
+
+		/*
+		 * Figure out which chunk we are currently looking at
+		 * and consider seeking to the next leaf. We use the
+		 * low bits of f_seekp as a simple chunk index.
+		 */
+	fzap_next:
+		chunk = fp->f_seekp & (bsize - 1);
+		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
+			fp->f_seekp = (fp->f_seekp & ~(bsize - 1)) + bsize;
+			chunk = 0;
+
+			/*
+			 * Check for EOF and read the new leaf.
+			 */
+			if (fp->f_seekp >= bsize * fp->f_num_leafs)
+				return (ENOENT);
+
+			rc = dnode_read(spa, &fp->f_dnode,
+			    fp->f_seekp,
+			    fp->f_zap_leaf,
+			    bsize);
+			if (rc)
+				return (rc);
+		}
+
+		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
+		fp->f_seekp++;
+		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+			goto fzap_next;
+
+		namelen = zc->l_entry.le_name_length;
+		if (namelen > sizeof(d->d_name))
+			namelen = sizeof(d->d_name);
+
+		/*
+		 * Paste the name back together.
+		 */
+		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+		p = d->d_name;
+		while (namelen > 0) {
+			int len;
+			len = namelen;
+			if (len > ZAP_LEAF_ARRAY_BYTES)
+				len = ZAP_LEAF_ARRAY_BYTES;
+			memcpy(p, nc->l_array.la_array, len);
+			p += len;
+			namelen -= len;
+			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+		}
+		d->d_name[sizeof(d->d_name) - 1] = 0;
+
+		/*
+		 * Assume the first eight bytes of the value are
+		 * a uint64_t.
+		 */
+		value = fzap_leaf_value(&zl, zc);
+
+		d->d_fileno = ZFS_DIRENT_OBJ(value);
+		d->d_type = ZFS_DIRENT_TYPE(value);
+		d->d_namlen = strlen(d->d_name);
+
+		return (0);
+	}
+}
+
+/*
+ * vdev read callback backed by a libstand file descriptor.
+ *
+ * priv carries the descriptor (stored as an integer in the pointer).
+ * Reads exactly size bytes at byte offset 'offset' into buf.  Returns 0
+ * on success and EIO if positioning fails or the read comes up short.
+ */
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+	int fd;
+
+	fd = (uintptr_t) priv;
+	/* Do not read from a stale position if the seek itself failed. */
+	if (lseek(fd, offset, SEEK_SET) == -1)
+		return (EIO);
+	if (read(fd, buf, size) != (ssize_t)size)
+		return (EIO);
+
+	return (0);
+}
+
+/*
+ * Map a pool guid to its 'unit number', i.e. the pool's position in the
+ * zfs_pools list, which is what zfs_dev_open expects.  Returns -1 when
+ * no known pool has that guid.
+ */
+int
+zfs_guid_to_unit(uint64_t guid)
+{
+	spa_t *pool;
+	int index;
+
+	index = 0;
+	for (pool = STAILQ_FIRST(&zfs_pools); pool != NULL;
+	    pool = STAILQ_NEXT(pool, spa_link)) {
+		if (pool->spa_guid == guid)
+			return (index);
+		index++;
+	}
+	return (-1);
+}
+
+/*
+ * Device-switch init hook: set up the ZFS reader and probe disks for
+ * pool members.  Always returns 0.
+ */
+static int
+zfs_dev_init(void)
+{
+	char name[512];
+	int disk, part;
+	int handle;
+
+	/*
+	 * Open all the disks we can find and see if we can reconstruct
+	 * ZFS pools from them. Bogusly assumes that the disks are named
+	 * diskN or diskNsM.
+	 */
+	zfs_init();
+	for (disk = 0; disk < 32 /* XXX */; disk++) {
+		sprintf(name, "disk%d:", disk);
+		handle = open(name, O_RDONLY);
+		if (handle != -1) {
+			/*
+			 * If we find a vdev, the zfs code will eat the fd,
+			 * otherwise we close it.
+			 */
+			if (vdev_probe(vdev_read, (void*) (uintptr_t) handle, 0))
+				close(handle);
+
+			/* Slices are only tried when the whole disk opened. */
+			part = 1;
+			while (part <= 4) {
+				sprintf(name, "disk%ds%d:", disk, part);
+				handle = open(name, O_RDONLY);
+				if (handle != -1 &&
+				    vdev_probe(vdev_read,
+				    (void*) (uintptr_t) handle, 0))
+					close(handle);
+				part++;
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print information about ZFS pools
+ *
+ * With verbose set, the full status of every pool is printed through
+ * spa_all_status().  Otherwise one "zfsN: name" line per pool is sent
+ * to the pager, N being the pool's position in zfs_pools.
+ */
+static void
+zfs_dev_print(int verbose)
+{
+	spa_t *spa;
+	char line[80];
+	int unit;
+
+	if (verbose) {
+		spa_all_status();
+		return;
+	}
+	unit = 0;
+	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+		/*
+		 * Only the fixed-size prefix is formatted into line[]; the
+		 * pool name has no length limit we control, so it is handed
+		 * to the pager directly instead of being copied.
+		 */
+		sprintf(line, " zfs%d: ", unit);
+		pager_output(line);
+		pager_output(spa->spa_name);
+		pager_output("\n");
+		unit++;
+	}
+}
+
+/*
+ * Attempt to open the pool described by (dev) for use by (f).
+ *
+ * The single variadic argument is a struct devdesc pointer; it is freed
+ * here after its unit number has been read.  On success f->f_devdata
+ * points at the selected pool and 0 is returned; ENXIO means there is
+ * no pool with that unit number.
+ */
+static int
+zfs_dev_open(struct open_file *f, ...)
+{
+	va_list ap;
+	struct devdesc *desc;
+	spa_t *pool;
+	int wanted, pos;
+
+	va_start(ap, f);
+	desc = va_arg(ap, struct devdesc*);
+	va_end(ap);
+
+	/*
+	 * We mostly ignore the stuff that devopen sends us. For now,
+	 * use the unit to find a pool - later we will override the
+	 * devname parsing so that we can name a pool and a fs within
+	 * the pool.
+	 */
+	wanted = desc->d_unit;
+	free(desc);
+
+	/* Walk to the wanted position; pool ends up NULL if we run out. */
+	pool = STAILQ_FIRST(&zfs_pools);
+	for (pos = 0; pool != NULL && pos != wanted; pos++)
+		pool = STAILQ_NEXT(pool, spa_link);
+
+	if (pool == NULL)
+		return (ENXIO);
+
+	f->f_devdata = pool;
+	return (0);
+}
+
+/*
+ * Detach the pool from an open file.  The pool itself stays on the
+ * zfs_pools list; only the reference in f is cleared.  Always returns 0.
+ */
+static int
+zfs_dev_close(struct open_file *f)
+{
+	void *none = NULL;
+
+	f->f_devdata = none;
+	return (0);
+}
+
+/*
+ * Block-level I/O is not offered by the zfs device; every request is
+ * refused with ENOSYS.
+ */
+static int
+zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
+{
+	(void)devdata;
+	(void)rw;
+	(void)dblk;
+	(void)size;
+	(void)buf;
+	(void)rsize;
+
+	return (ENOSYS);
+}
+
+/*
+ * Device switch entry for the "zfs" pseudo device.  Probing is done in
+ * zfs_dev_init(), zfs_dev_open() selects a pool by unit number, and the
+ * strategy routine rejects every request.  No ioctls (noioctl) and no
+ * cleanup hook are provided.
+ */
+struct devsw zfs_dev = {
+ .dv_name = "zfs",
+ .dv_type = DEVT_ZFS,
+ .dv_init = zfs_dev_init,
+ .dv_strategy = zfs_dev_strategy,
+ .dv_open = zfs_dev_open,
+ .dv_close = zfs_dev_close,
+ .dv_ioctl = noioctl,
+ .dv_print = zfs_dev_print,
+ .dv_cleanup = NULL
+};
diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c
new file mode 100644
index 000000000000..5bbc351f8408
--- /dev/null
+++ b/sys/boot/zfs/zfsimpl.c
@@ -0,0 +1,1443 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Stand-alone ZFS file reader.
+ */
+
+#include "zfsimpl.h"
+#include "zfssubr.c"
+
+/*
+ * List of all vdevs, chained through v_alllink.
+ */
+static vdev_list_t zfs_vdevs;
+
+/*
+ * List of all pools, chained through spa_link.
+ */
+static spa_list_t zfs_pools;
+
+/*
+ * Shared state.  The three buffers are allocated in zfs_init(), 128KB
+ * each.
+ */
+static uint64_t zfs_crc64_table[256];
+/* Staging area for the on-disk bytes of a compressed block (zio_read_phys). */
+static char *zfs_decomp_buf;
+/*
+ * One-block cache used by dnode_read(): dnode_cache_buf holds block
+ * number dnode_cache_bn of the dnode at address dnode_cache_obj
+ * (0 = cache empty).
+ */
+static const dnode_phys_t *dnode_cache_obj = 0;
+static uint64_t dnode_cache_bn;
+static char *dnode_cache_buf;
+/* Scratch space; vdev_probe() reads labels into it and ZAP lookups use it. */
+static char *zap_scratch;
+
+/*
+ * Forward declarations.
+ */
+static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset);
+
+/*
+ * One-time initialisation of the reader: empty the vdev and pool lists,
+ * allocate the three shared 128KB work buffers and set up the CRC
+ * state via zfs_init_crc().
+ */
+static void
+zfs_init(void)
+{
+	enum { WORKBUF_BYTES = 128*1024 };
+
+	STAILQ_INIT(&zfs_vdevs);
+	STAILQ_INIT(&zfs_pools);
+
+	zfs_decomp_buf = malloc(WORKBUF_BYTES);
+	dnode_cache_buf = malloc(WORKBUF_BYTES);
+	zap_scratch = malloc(WORKBUF_BYTES);
+
+	zfs_init_crc();
+}
+
+/*
+ * Decode one big-endian 32-bit signed integer from the XDR stream at
+ * *xdr into *ip and advance *xdr by four bytes.  Always returns 0.
+ */
+static int
+xdr_int(const unsigned char **xdr, int *ip)
+{
+	const unsigned char *p = *xdr;
+	uint32_t raw;
+
+	/*
+	 * Assemble the value as unsigned: shifting a promoted (signed int)
+	 * byte >= 0x80 left by 24 would overflow.
+	 */
+	raw = ((uint32_t)p[0] << 24)
+	    | ((uint32_t)p[1] << 16)
+	    | ((uint32_t)p[2] << 8)
+	    | ((uint32_t)p[3] << 0);
+	*ip = (int)raw;
+	*xdr = p + 4;
+	return (0);
+}
+
+/*
+ * Decode one big-endian 32-bit unsigned integer from the XDR stream at
+ * *xdr into *ip and advance *xdr by four bytes.  Always returns 0.
+ */
+static int
+xdr_u_int(const unsigned char **xdr, u_int *ip)
+{
+	const unsigned char *p = *xdr;
+
+	/* Widen each byte to unsigned before shifting to avoid int overflow. */
+	*ip = ((uint32_t)p[0] << 24)
+	    | ((uint32_t)p[1] << 16)
+	    | ((uint32_t)p[2] << 8)
+	    | ((uint32_t)p[3] << 0);
+	*xdr = p + 4;
+	return (0);
+}
+
+/*
+ * Decode a 64-bit unsigned integer, stored as two consecutive 32-bit
+ * words with the most significant word first, into *lp.  Advances *xdr
+ * by eight bytes and always returns 0.
+ */
+static int
+xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
+{
+	u_int word[2];
+	uint64_t upper;
+
+	xdr_u_int(xdr, &word[0]);
+	xdr_u_int(xdr, &word[1]);
+
+	upper = (uint64_t) word[0];
+	*lp = (upper << 32) | word[1];
+	return (0);
+}
+
+/*
+ * Look up the pair called 'name' with data type 'type' in an
+ * XDR-encoded nvlist.
+ *
+ * On a match the element count is stored in *elementsp (if non-NULL)
+ * and the value in *valuep:
+ *   DATA_TYPE_UINT64                 - the decoded uint64_t
+ *   DATA_TYPE_STRING                 - pointer to the string bytes
+ *                                      inside the nvlist (not copied)
+ *   DATA_TYPE_NVLIST / _NVLIST_ARRAY - pointer to the embedded nvlist
+ * Returns 0 on success and EIO if the pair is missing or has a type
+ * this function cannot decode.
+ */
+static int
+nvlist_find(const unsigned char *nvlist, const char *name, int type,
+    int* elementsp, void *valuep)
+{
+	const unsigned char *p, *pair;
+	size_t wanted = strlen(name);
+	int junk;
+	int encoded_size, decoded_size;
+
+	/* Skip the two header words in front of the first pair. */
+	p = nvlist;
+	xdr_int(&p, &junk);
+	xdr_int(&p, &junk);
+
+	for (;;) {
+		int namelen, pairtype, elements;
+		const char *pairname;
+
+		/* A pair with a zero size terminates the list. */
+		pair = p;
+		xdr_int(&p, &encoded_size);
+		xdr_int(&p, &decoded_size);
+		if (!encoded_size || !decoded_size)
+			break;
+
+		xdr_int(&p, &namelen);
+		pairname = (const char*) p;
+		p += roundup(namelen, 4);
+		xdr_int(&p, &pairtype);
+
+		/*
+		 * The lengths must agree as well as the bytes; otherwise a
+		 * pair whose name is merely a prefix of 'name' would match.
+		 */
+		if (type == pairtype
+		    && namelen >= 0 && (size_t) namelen == wanted
+		    && !memcmp(name, pairname, wanted)) {
+			xdr_int(&p, &elements);
+			if (elementsp)
+				*elementsp = elements;
+			if (type == DATA_TYPE_UINT64) {
+				xdr_uint64_t(&p, (uint64_t *) valuep);
+				return (0);
+			} else if (type == DATA_TYPE_STRING) {
+				int len;
+				xdr_int(&p, &len);
+				(*(const char**) valuep) = (const char*) p;
+				return (0);
+			} else if (type == DATA_TYPE_NVLIST
+			    || type == DATA_TYPE_NVLIST_ARRAY) {
+				(*(const unsigned char**) valuep) =
+				    (const unsigned char*) p;
+				return (0);
+			} else {
+				return (EIO);
+			}
+		}
+
+		/*
+		 * Not the pair we are looking for, skip to the next one.
+		 */
+		p = pair + encoded_size;
+	}
+
+	return (EIO);
+}
+
+/*
+ * Return the next nvlist in an nvlist array.
+ *
+ * Steps over the two header words and every pair of 'nvlist' and
+ * returns the address just past the terminating pair header.
+ */
+static const unsigned char *
+nvlist_next(const unsigned char *nvlist)
+{
+	const unsigned char *cursor, *start;
+	int ignored;
+	int enc, dec;
+
+	cursor = nvlist;
+	xdr_int(&cursor, &ignored);
+	xdr_int(&cursor, &ignored);
+
+	for (;;) {
+		start = cursor;
+		xdr_int(&cursor, &enc);
+		xdr_int(&cursor, &dec);
+		/* Either size being zero marks the end of the list. */
+		if (!enc || !dec)
+			break;
+		cursor = start + enc;
+	}
+
+	return cursor;
+}
+
+#ifdef TEST
+
+/*
+ * Debugging aid (only built when TEST is defined): print every pair of
+ * an XDR-encoded nvlist, indented by 'indent' levels, recursing into
+ * embedded nvlists.  Values are shown for UINT64 and STRING pairs only.
+ * Returns the address just past the list's terminating pair header so
+ * that callers can step through nvlist arrays.
+ *
+ * NOTE(review): typenames[] is indexed with the pair type taken from
+ * the data without a range check.
+ */
+static const unsigned char *
+nvlist_print(const unsigned char *nvlist, unsigned int indent)
+{
+ static const char* typenames[] = {
+ "DATA_TYPE_UNKNOWN",
+ "DATA_TYPE_BOOLEAN",
+ "DATA_TYPE_BYTE",
+ "DATA_TYPE_INT16",
+ "DATA_TYPE_UINT16",
+ "DATA_TYPE_INT32",
+ "DATA_TYPE_UINT32",
+ "DATA_TYPE_INT64",
+ "DATA_TYPE_UINT64",
+ "DATA_TYPE_STRING",
+ "DATA_TYPE_BYTE_ARRAY",
+ "DATA_TYPE_INT16_ARRAY",
+ "DATA_TYPE_UINT16_ARRAY",
+ "DATA_TYPE_INT32_ARRAY",
+ "DATA_TYPE_UINT32_ARRAY",
+ "DATA_TYPE_INT64_ARRAY",
+ "DATA_TYPE_UINT64_ARRAY",
+ "DATA_TYPE_STRING_ARRAY",
+ "DATA_TYPE_HRTIME",
+ "DATA_TYPE_NVLIST",
+ "DATA_TYPE_NVLIST_ARRAY",
+ "DATA_TYPE_BOOLEAN_VALUE",
+ "DATA_TYPE_INT8",
+ "DATA_TYPE_UINT8",
+ "DATA_TYPE_BOOLEAN_ARRAY",
+ "DATA_TYPE_INT8_ARRAY",
+ "DATA_TYPE_UINT8_ARRAY"
+ };
+
+ unsigned int i, j;
+ const unsigned char *p, *pair;
+ int junk;
+ int encoded_size, decoded_size;
+
+ /* Skip the two header words in front of the first pair. */
+ p = nvlist;
+ xdr_int(&p, &junk);
+ xdr_int(&p, &junk);
+
+ /* Each pair starts with its encoded and decoded size; zero ends the list. */
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ while (encoded_size && decoded_size) {
+ int namelen, pairtype, elements;
+ const char *pairname;
+
+ /* Name length, name bytes padded to a multiple of four, then the type. */
+ xdr_int(&p, &namelen);
+ pairname = (const char*) p;
+ p += roundup(namelen, 4);
+ xdr_int(&p, &pairtype);
+
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %s", typenames[pairtype], pairname);
+
+ xdr_int(&p, &elements);
+ switch (pairtype) {
+ case DATA_TYPE_UINT64: {
+ uint64_t val;
+ xdr_uint64_t(&p, &val);
+ printf(" = 0x%llx\n", val);
+ break;
+ }
+
+ case DATA_TYPE_STRING: {
+ int len;
+ xdr_int(&p, &len);
+ printf(" = \"%s\"\n", p);
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ printf("\n");
+ nvlist_print(p, indent + 1);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ /* The recursive call returns where the next array element starts. */
+ for (j = 0; j < elements; j++) {
+ printf("[%d]\n", j);
+ p = nvlist_print(p, indent + 1);
+ if (j != elements - 1) {
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %s", typenames[pairtype], pairname);
+ }
+ }
+ break;
+
+ default:
+ printf("\n");
+ }
+
+ /* Whatever was consumed above, the next pair starts encoded_size bytes on. */
+ p = pair + encoded_size;
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ }
+
+ return p;
+}
+
+#endif
+
+/*
+ * Read callback for mirror vdevs: try each healthy child in list order
+ * until one of them delivers the data.  Returns 0 on the first
+ * successful read, otherwise the error of the last child tried, or EIO
+ * when no child is healthy.  The priv argument is not used.
+ */
+static int
+vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+	vdev_t *child;
+	int error = EIO;
+
+	for (child = STAILQ_FIRST(&vdev->v_children); child;
+	    child = STAILQ_NEXT(child, v_childlink)) {
+		if (child->v_state == VDEV_STATE_HEALTHY) {
+			error = child->v_read(child, child->v_read_priv,
+			    offset, buf, size);
+			if (error == 0)
+				break;
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Find a known vdev by guid on the global zfs_vdevs list.  Returns a
+ * null pointer if there is none.
+ */
+static vdev_t *
+vdev_find(uint64_t guid)
+{
+	vdev_t *cur = STAILQ_FIRST(&zfs_vdevs);
+
+	while (cur && cur->v_guid != guid)
+		cur = STAILQ_NEXT(cur, v_alllink);
+
+	return (cur);
+}
+
+/*
+ * Allocate a zeroed vdev with the given guid and read callback and
+ * append it to the global zfs_vdevs list.  New vdevs start out in
+ * state VDEV_STATE_OFFLINE with no children.
+ */
+static vdev_t *
+vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv)
+{
+	vdev_t *fresh = malloc(sizeof(vdev_t));
+
+	memset(fresh, 0, sizeof(vdev_t));
+	STAILQ_INIT(&fresh->v_children);
+	fresh->v_read_priv = read_priv;
+	fresh->v_read = read;
+	fresh->v_state = VDEV_STATE_OFFLINE;
+	fresh->v_guid = guid;
+	STAILQ_INSERT_TAIL(&zfs_vdevs, fresh, v_alllink);
+
+	return (fresh);
+}
+
+/*
+ * Build the in-core vdev (and, recursively, its children) described by
+ * an nvlist from a vdev label.
+ *
+ * A vdev whose guid is already known is returned as is, without looking
+ * at the nvlist any further.  Only "disk" and "mirror" vdevs are
+ * accepted.  The vdev is named after its path with a leading "/dev/"
+ * removed, or after its type when it has no path.
+ *
+ * Returns 0 and stores the vdev in *vdevp (if non-NULL); ENOENT when
+ * guid, id or type are missing, EIO for an unsupported type, or the
+ * error from a child.
+ */
+static int
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+{
+	int rc;
+	uint64_t guid, id;
+	const char *type;
+	const char *path;
+	vdev_t *vdev, *kid;
+	const unsigned char *kids;
+	int nkids, i;
+	int is_mirror;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
+	    DATA_TYPE_UINT64, 0, &guid) != 0
+	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
+	    DATA_TYPE_UINT64, 0, &id) != 0
+	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
+	    DATA_TYPE_STRING, 0, &type) != 0) {
+		printf("ZFS: can't find vdev details\n");
+		return (ENOENT);
+	}
+
+	/*
+	 * Assume that if we've seen this vdev tree before, this one
+	 * will be identical.
+	 */
+	vdev = vdev_find(guid);
+	if (vdev) {
+		if (vdevp)
+			*vdevp = vdev;
+		return (0);
+	}
+
+	is_mirror = (strcmp(type, VDEV_TYPE_MIRROR) == 0);
+	if (!is_mirror && strcmp(type, VDEV_TYPE_DISK) != 0) {
+		printf("ZFS: can only boot from disk or mirror vdevs\n");
+		return (EIO);
+	}
+
+	/* Leaf disks get their read routine later, when they are probed. */
+	vdev = vdev_create(guid, is_mirror ? vdev_mirror_read : 0, 0);
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+	    DATA_TYPE_STRING, 0, &path) != 0) {
+		vdev->v_name = strdup(type);
+	} else {
+		if (strlen(path) > 5
+		    && path[0] == '/'
+		    && path[1] == 'd'
+		    && path[2] == 'e'
+		    && path[3] == 'v'
+		    && path[4] == '/')
+			path += 5;
+		vdev->v_name = strdup(path);
+	}
+	vdev->v_id = id;
+
+	/*
+	 * It's ok if we don't have any kids.
+	 */
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
+	    DATA_TYPE_NVLIST_ARRAY, &nkids, &kids) == 0) {
+		for (i = 0; i < nkids; i++, kids = nvlist_next(kids)) {
+			rc = vdev_init_from_nvlist(kids, &kid);
+			if (rc)
+				return (rc);
+			STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+		}
+	}
+
+	if (vdevp)
+		*vdevp = vdev;
+	return (0);
+}
+
+/*
+ * Derive the state of a vdev from the state of its children.  A vdev
+ * without children is left untouched.
+ */
+static void
+vdev_set_state(vdev_t *vdev)
+{
+	vdev_t *child;
+	int healthy = 0;
+	int unhealthy = 0;
+
+	/*
+	 * We assume that if we have kids, we are a mirror. A mirror
+	 * is healthy if all its kids are healthy. It's degraded (but
+	 * working) if at least one kid is healthy.
+	 */
+	if (!STAILQ_FIRST(&vdev->v_children))
+		return;
+
+	for (child = STAILQ_FIRST(&vdev->v_children); child;
+	    child = STAILQ_NEXT(child, v_childlink)) {
+		if (child->v_state == VDEV_STATE_HEALTHY)
+			healthy++;
+		else
+			unhealthy++;
+	}
+
+	if (healthy == 0)
+		vdev->v_state = VDEV_STATE_OFFLINE;
+	else if (unhealthy == 0)
+		vdev->v_state = VDEV_STATE_HEALTHY;
+	else
+		vdev->v_state = VDEV_STATE_DEGRADED;
+}
+
+/*
+ * Find a known pool by guid on the zfs_pools list.  Returns a null
+ * pointer if no pool matches.
+ */
+static spa_t *
+spa_find_by_guid(uint64_t guid)
+{
+	spa_t *cur = STAILQ_FIRST(&zfs_pools);
+
+	while (cur && cur->spa_guid != guid)
+		cur = STAILQ_NEXT(cur, spa_link);
+
+	return (cur);
+}
+
+#ifdef BOOT2
+
+/*
+ * Find a known pool by exact name on the zfs_pools list.  Returns a
+ * null pointer if no pool matches.
+ */
+static spa_t *
+spa_find_by_name(const char *name)
+{
+	spa_t *cur = STAILQ_FIRST(&zfs_pools);
+
+	while (cur && strcmp(cur->spa_name, name) != 0)
+		cur = STAILQ_NEXT(cur, spa_link);
+
+	return (cur);
+}
+
+#endif
+
+/*
+ * Allocate a zeroed pool descriptor for 'guid' with an empty vdev list
+ * and append it to zfs_pools.  The caller fills in the name.
+ */
+static spa_t *
+spa_create(uint64_t guid)
+{
+	spa_t *pool = malloc(sizeof(spa_t));
+
+	memset(pool, 0, sizeof(spa_t));
+	pool->spa_guid = guid;
+	STAILQ_INIT(&pool->spa_vdevs);
+	STAILQ_INSERT_TAIL(&zfs_pools, pool, spa_link);
+
+	return (pool);
+}
+
+/*
+ * Map a vdev state to the label shown in status output.  The table is
+ * indexed by the numeric value of the state; values outside the table
+ * are reported as "UNKNOWN" instead of reading past its end.
+ */
+static const char *
+state_name(vdev_state_t state)
+{
+	static const char* names[] = {
+		"UNKNOWN",
+		"CLOSED",
+		"OFFLINE",
+		"CANT_OPEN",
+		"DEGRADED",
+		"ONLINE"
+	};
+
+	/* The unsigned cast folds negative values into the range check. */
+	if ((unsigned int) state >= sizeof(names) / sizeof(names[0]))
+		return names[0];
+	return names[state];
+}
+
+#ifdef BOOT2
+
+#define pager_printf printf
+
+#else
+
+/*
+ * printf-style front end to pager_output(): format the message into a
+ * local buffer and hand the result to the pager.
+ *
+ * NOTE(review): vsprintf() does not know the size of line[], so the
+ * formatted text has to stay below 80 bytes.  print_state() passes
+ * names assembled in a 512-byte buffer -- confirm that callers cannot
+ * exceed the limit.
+ */
+static void
+pager_printf(const char *fmt, ...)
+{
+ char line[80];
+ va_list args;
+
+ va_start(args, fmt);
+ vsprintf(line, fmt, args);
+ va_end(args);
+ pager_output(line);
+}
+
+#endif
+
+#define STATUS_FORMAT " %-16s %-10s\n"
+
+/*
+ * Print one line of the status table: 'name' preceded by one pad
+ * character per indent level, followed by the textual form of 'state'.
+ * Indentation plus name are truncated to fit the local buffer rather
+ * than being appended without a bound.
+ */
+static void
+print_state(int indent, const char *name, vdev_state_t state)
+{
+	char buf[512];
+	size_t used = 0;
+	int i;
+
+	/* Always keep one byte in reserve for the terminator. */
+	for (i = 0; i < indent && used + 1 < sizeof(buf); i++)
+		buf[used++] = ' ';
+	while (*name != 0 && used + 1 < sizeof(buf))
+		buf[used++] = *name++;
+	buf[used] = 0;
+
+	pager_printf(STATUS_FORMAT, buf, state_name(state));
+}
+
+/*
+ * Print the status line of a vdev followed, one level deeper, by the
+ * status of each of its children.
+ */
+static void
+vdev_status(vdev_t *vdev, int indent)
+{
+	vdev_t *child;
+
+	print_state(indent, vdev->v_name, vdev->v_state);
+
+	for (child = STAILQ_FIRST(&vdev->v_children); child;
+	    child = STAILQ_NEXT(child, v_childlink))
+		vdev_status(child, indent + 1);
+}
+
+/*
+ * Print a status report for one pool: a header, a summary line for the
+ * pool itself and the tree of its vdevs.  The pool counts as healthy
+ * when all of its top-level vdevs are, as degraded when at least one is
+ * healthy or degraded, and as closed otherwise.
+ */
+static void
+spa_status(spa_t *spa)
+{
+	vdev_t *top;
+	int healthy = 0, degraded = 0, failed = 0;
+	vdev_state_t summary;
+
+	pager_printf(" pool: %s\n", spa->spa_name);
+	pager_printf("config:\n\n");
+	pager_printf(STATUS_FORMAT, "NAME", "STATE");
+
+	for (top = STAILQ_FIRST(&spa->spa_vdevs); top;
+	    top = STAILQ_NEXT(top, v_childlink)) {
+		switch (top->v_state) {
+		case VDEV_STATE_HEALTHY:
+			healthy++;
+			break;
+		case VDEV_STATE_DEGRADED:
+			degraded++;
+			break;
+		default:
+			failed++;
+			break;
+		}
+	}
+
+	if (healthy > 0 && (degraded + failed) == 0)
+		summary = VDEV_STATE_HEALTHY;
+	else if ((healthy + degraded) > 0)
+		summary = VDEV_STATE_DEGRADED;
+	else
+		summary = VDEV_STATE_CLOSED;
+
+	print_state(0, spa->spa_name, summary);
+	for (top = STAILQ_FIRST(&spa->spa_vdevs); top;
+	    top = STAILQ_NEXT(top, v_childlink))
+		vdev_status(top, 1);
+}
+
+/*
+ * Print the status report of every known pool, with a blank line
+ * between consecutive reports.
+ */
+static void
+spa_all_status(void)
+{
+	spa_t *pool;
+	int printed = 0;
+
+	for (pool = STAILQ_FIRST(&zfs_pools); pool;
+	    pool = STAILQ_NEXT(pool, spa_link)) {
+		if (printed)
+			pager_printf("\n");
+		printed = 1;
+		spa_status(pool);
+	}
+}
+
+/*
+ * Examine a device for a ZFS vdev label and, if one is found, hook the
+ * device into the in-core pool/vdev tree.
+ *
+ * read/read_priv are the callback and cookie used to read from the
+ * device; they are stored in the matching vdev on success.  The label
+ * must be XDR encoded, carry the supported ZFS version and belong to an
+ * active pool.  The pool is created on first sight, and its best
+ * uberblock is updated from this device's uberblock ring.
+ *
+ * Returns 0 on success, storing the pool in *spap if spap is non-NULL.
+ * Any non-zero return means the device was not taken over.
+ */
+static int
+vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
+{
+	vdev_t vtmp;
+	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
+	spa_t *spa;
+	vdev_t *vdev, *top_vdev, *pool_vdev;
+	off_t off;
+	blkptr_t bp;
+	const unsigned char *nvlist;
+	uint64_t val;
+	uint64_t guid;
+	uint64_t pool_txg, pool_guid;
+	const char *pool_name;
+	const unsigned char *vdevs;
+	int i, rc;
+	/* Sized to match the uberblock reads issued below. */
+	char upbuf[1 << UBERBLOCK_SHIFT];
+	const struct uberblock *up;
+
+	/*
+	 * Load the vdev label and figure out which
+	 * uberblock is most current.
+	 */
+	memset(&vtmp, 0, sizeof(vtmp));
+	vtmp.v_read = read;
+	vtmp.v_read_priv = read_priv;
+	off = offsetof(vdev_label_t, vl_vdev_phys);
+	BP_ZERO(&bp);
+	BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
+	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
+	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+	if (zio_read_phys(&vtmp, &bp, vdev_label, off))
+		return (EIO);
+
+	if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
+		return (EIO);
+	}
+
+	nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
+
+	if (nvlist_find(nvlist,
+	    ZPOOL_CONFIG_VERSION,
+	    DATA_TYPE_UINT64, 0, &val)) {
+		return (EIO);
+	}
+
+	if (val != ZFS_VERSION) {
+		printf("ZFS: unsupported ZFS version %d\n", (int) val);
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+	    ZPOOL_CONFIG_POOL_STATE,
+	    DATA_TYPE_UINT64, 0, &val)) {
+		return (EIO);
+	}
+
+	if (val != POOL_STATE_ACTIVE) {
+		/*
+		 * Don't print a message here. If we happen to reboot
+		 * while there is an exported pool around, we don't
+		 * need a cascade of confusing messages during boot.
+		 */
+		/*printf("ZFS: pool is not active\n");*/
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+	    ZPOOL_CONFIG_POOL_TXG,
+	    DATA_TYPE_UINT64, 0, &pool_txg)
+	    || nvlist_find(nvlist,
+	    ZPOOL_CONFIG_POOL_GUID,
+	    DATA_TYPE_UINT64, 0, &pool_guid)
+	    || nvlist_find(nvlist,
+	    ZPOOL_CONFIG_POOL_NAME,
+	    DATA_TYPE_STRING, 0, &pool_name)) {
+		printf("ZFS: can't find pool details\n");
+		return (EIO);
+	}
+
+	/*
+	 * Create the pool if this is the first time we've seen it.
+	 */
+	spa = spa_find_by_guid(pool_guid);
+	if (!spa) {
+		spa = spa_create(pool_guid);
+		spa->spa_name = strdup(pool_name);
+	}
+	if (pool_txg > spa->spa_txg)
+		spa->spa_txg = pool_txg;
+
+	/*
+	 * Get the vdev tree and create our in-core copy of it.
+	 * If we already have a healthy vdev with this guid, this must
+	 * be some kind of alias (overlapping slices, dangerously dedicated
+	 * disks etc).
+	 */
+	if (nvlist_find(nvlist,
+	    ZPOOL_CONFIG_GUID,
+	    DATA_TYPE_UINT64, 0, &guid)) {
+		return (EIO);
+	}
+	vdev = vdev_find(guid);
+	if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+	    ZPOOL_CONFIG_VDEV_TREE,
+	    DATA_TYPE_NVLIST, 0, &vdevs)) {
+		return (EIO);
+	}
+
+	/*
+	 * Bail out if the tree cannot be built: top_vdev is only set on
+	 * success and must not be used otherwise.
+	 */
+	top_vdev = 0;
+	rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+	if (rc)
+		return (rc);
+
+	/*
+	 * Add the toplevel vdev to the pool if it's not already there.
+	 */
+	STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
+		if (top_vdev == pool_vdev)
+			break;
+	if (!pool_vdev && top_vdev)
+		STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
+
+	/*
+	 * We should already have created an incomplete vdev for this
+	 * vdev. Find it and initialise it with our read proc.
+	 */
+	vdev = vdev_find(guid);
+	if (vdev) {
+		vdev->v_read = read;
+		vdev->v_read_priv = read_priv;
+		vdev->v_state = VDEV_STATE_HEALTHY;
+	} else {
+		printf("ZFS: inconsistent nvlist contents\n");
+		return (EIO);
+	}
+
+	/*
+	 * Re-evaluate top-level vdev state.
+	 */
+	vdev_set_state(top_vdev);
+
+	/*
+	 * Ok, we are happy with the pool so far. Lets find
+	 * the best uberblock and then we can actually access
+	 * the contents of the pool.
+	 */
+	for (i = 0;
+	    i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
+	    i++) {
+		off = offsetof(vdev_label_t, vl_uberblock);
+		off += i << UBERBLOCK_SHIFT;
+		BP_ZERO(&bp);
+		DVA_SET_OFFSET(&bp.blk_dva[0], off);
+		BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+		BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+		if (zio_read_phys(vdev, &bp, upbuf, off))
+			continue;
+
+		/* Newest txg wins; the timestamp breaks ties. */
+		up = (const struct uberblock *) upbuf;
+		if (up->ub_magic != UBERBLOCK_MAGIC)
+			continue;
+		if (up->ub_txg < spa->spa_txg)
+			continue;
+		if (up->ub_txg > spa->spa_uberblock.ub_txg) {
+			spa->spa_uberblock = *up;
+		} else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
+			if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
+				spa->spa_uberblock = *up;
+		}
+	}
+
+	if (spap)
+		*spap = spa;
+	return (0);
+}
+
+/*
+ * Return the base-2 logarithm of n when n is an exact power of two
+ * (n == 1 << v for some v in 0..31), and -1 for every other value,
+ * including 0.
+ */
+static int
+ilog2(int n)
+{
+	unsigned int bits = (unsigned int) n;
+	int v;
+
+	/* Compare as unsigned: 1 << 31 overflows a signed int. */
+	for (v = 0; v < 32; v++)
+		if (bits == (1u << v))
+			return v;
+	return -1;
+}
+
+/*
+ * Read the block described by bp from 'vdev' at byte offset 'offset',
+ * verify its checksum and leave the logical contents in buf.
+ * Compressed blocks are staged in zfs_decomp_buf and decompressed into
+ * buf; uncompressed blocks are read straight into buf.  Returns 0 on
+ * success, the error of the vdev's read routine, or EIO on a checksum
+ * or decompression failure.
+ */
+static int
+zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset)
+{
+	int cpfunc = BP_GET_COMPRESS(bp);
+	size_t lsize = BP_GET_LSIZE(bp);
+	size_t psize = BP_GET_PSIZE(bp);
+	int compressed = (cpfunc != ZIO_COMPRESS_OFF);
+	void *raw;
+	int rc;
+
+	/*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
+
+	/* The checksum covers the bytes as stored on disk. */
+	raw = compressed ? (void *) zfs_decomp_buf : buf;
+
+	rc = vdev->v_read(vdev, vdev->v_read_priv, offset, raw, psize);
+	if (rc)
+		return (rc);
+	if (zio_checksum_error(bp, raw))
+		return (EIO);
+	if (compressed &&
+	    zio_decompress_data(cpfunc, zfs_decomp_buf, psize, buf, lsize))
+		return (EIO);
+
+	return (0);
+}
+
+/*
+ * Read the block described by bp from pool 'spa' into buf, trying each
+ * copy (DVA) recorded in the block pointer in turn.  A copy is skipped
+ * when its DVA is empty, when its top-level vdev is unknown or has no
+ * read routine, or when reading it fails.  Returns 0 as soon as one
+ * copy has been read, EIO (after printing a message) if none could.
+ */
+static int
+zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
+{
+	int copy;
+
+	for (copy = 0; copy < SPA_DVAS_PER_BP; copy++) {
+		const dva_t *dva = &bp->blk_dva[copy];
+		vdev_t *target;
+		int vdevid;
+		off_t offset;
+
+		if (dva->dva_word[0] || dva->dva_word[1]) {
+			vdevid = DVA_GET_VDEV(dva);
+			/* DVA offsets do not include the label area. */
+			offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE;
+
+			target = STAILQ_FIRST(&spa->spa_vdevs);
+			while (target && target->v_id != vdevid)
+				target = STAILQ_NEXT(target, v_childlink);
+
+			if (target && target->v_read &&
+			    zio_read_phys(target, bp, buf, offset) == 0)
+				return (0);
+		}
+	}
+	printf("ZFS: i/o error - all block copies unavailable\n");
+
+	return (EIO);
+}
+
+/*
+ * Copy buflen bytes starting at byte 'offset' of the object described
+ * by 'dnode' into buf, crossing data block boundaries as needed.
+ *
+ * Each data block is located by walking the dnode's block pointer tree
+ * from the top level down; every level is read into dnode_cache_buf,
+ * so after the walk the buffer holds the data block itself.  The last
+ * data block read is remembered (dnode address + block number) and
+ * reused when the next request hits the same block.
+ *
+ * Returns 0 on success, EIO if the offset lies beyond the object's last
+ * block, or the error from zio_read().
+ */
+static int
+dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
+{
+ /* log2 of the number of block pointers in one indirect block */
+ int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ /* data block size in bytes */
+ int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ int nlevels = dnode->dn_nlevels;
+ int i, rc;
+
+ /*
+ * We truncate the offset to 32bits, mainly so that I don't
+ * have to find a copy of __divdi3 to put into the bootstrap.
+ * I don't think the bootstrap needs to access anything bigger
+ * than 2G anyway. Note that block addresses are still 64bit
+ * so it doesn't affect the possible size of the media.
+ * We still use 64bit block numbers so that the bitshifts
+ * work correctly. Note: bsize may not be a power of two here.
+ */
+ while (buflen > 0) {
+ uint64_t bn = ((int) offset) / bsize;
+ int boff = ((int) offset) % bsize;
+ int ibn;
+ const blkptr_t *indbp;
+ blkptr_t bp;
+
+ if (bn > dnode->dn_maxblkid)
+ return (EIO);
+
+ /* The cache is keyed on the dnode's address, not its contents. */
+ if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
+ goto cached;
+
+ /* Start at the block pointers embedded in the dnode. */
+ indbp = dnode->dn_blkptr;
+ for (i = 0; i < nlevels; i++) {
+ /*
+ * Copy the bp from the indirect array so that
+ * we can re-use the scratch buffer for multi-level
+ * objects.
+ */
+ ibn = bn >> ((nlevels - i - 1) * ibshift);
+ ibn &= ((1 << ibshift) - 1);
+ bp = indbp[ibn];
+ rc = zio_read(spa, &bp, dnode_cache_buf);
+ if (rc)
+ return (rc);
+ indbp = (const blkptr_t *) dnode_cache_buf;
+ }
+ /* Only record the cache tag once the whole walk has succeeded. */
+ dnode_cache_obj = dnode;
+ dnode_cache_bn = bn;
+ cached:
+
+ /*
+ * The buffer contains our data block. Copy what we
+ * need from it and loop.
+ */
+ i = bsize - boff;
+ if (i > buflen) i = buflen;
+ memcpy(buf, &dnode_cache_buf[boff], i);
+ buf = ((char*) buf) + i;
+ offset += i;
+ buflen -= i;
+ }
+
+ return (0);
+}
+
+/*
+ * Lookup a value in a microzap directory. Assumes that the zap
+ * scratch buffer contains the directory contents.
+ *
+ * Stores the value of the entry called 'name' in *value and returns 0,
+ * or returns ENOENT when no entry has that name.
+ */
+static int
+mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	const mzap_phys_t *dir;
+	const mzap_ent_phys_t *ent;
+	size_t size;
+	int chunks, idx;
+
+	/*
+	 * Microzap objects use exactly one block, so the entry count
+	 * follows from the block size; one entry-sized slot is not part
+	 * of the chunk array.
+	 */
+	size = dnode->dn_datablkszsec * 512;
+	chunks = size / MZAP_ENT_LEN - 1;
+
+	dir = (const mzap_phys_t *) zap_scratch;
+	ent = dir->mz_chunk;
+	for (idx = 0; idx < chunks; idx++, ent++) {
+		if (strcmp(ent->mze_name, name) == 0) {
+			*value = ent->mze_value;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Test whether 'name' matches the name stored in zap leaf entry zc.
+ * The stored name is le_name_length bytes long and is spread over a
+ * linked list of array chunks, each holding up to ZAP_LEAF_ARRAY_BYTES
+ * bytes.  Returns 1 on a match and 0 otherwise.
+ */
+static int
+fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
+{
+	const zap_leaf_chunk_t *chunk;
+	const char *cursor = name;
+	size_t remaining, piece;
+
+	chunk = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
+	for (remaining = zc->l_entry.le_name_length; remaining > 0;
+	    remaining -= piece) {
+		/* Compare at most one chunk's worth per iteration. */
+		piece = remaining;
+		if (piece > ZAP_LEAF_ARRAY_BYTES)
+			piece = ZAP_LEAF_ARRAY_BYTES;
+		if (memcmp(cursor, chunk->l_array.la_array, piece) != 0)
+			return (0);
+		cursor += piece;
+		chunk = &ZAP_LEAF_CHUNK(zl, chunk->l_array.la_next);
+	}
+
+	return (1);
+}
+
+/*
+ * Extract a uint64_t value from a zap leaf entry.  The first eight
+ * bytes of the entry's value chunk are assembled most-significant
+ * byte first.
+ */
+static uint64_t
+fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
+{
+	const zap_leaf_chunk_t *valchunk;
+	const uint8_t *bytes;
+	uint64_t result = 0;
+	int n = 0;
+
+	valchunk = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
+	bytes = valchunk->l_array.la_array;
+	while (n < 8) {
+		result = (result << 8) | bytes[n];
+		n++;
+	}
+
+	return (result);
+}
+
+/*
+ * Lookup a value in a fatzap directory. Assumes that the zap scratch
+ * buffer contains the directory header.
+ *
+ * The name is hashed, the pointer table maps the top bits of the hash
+ * to a leaf block, and the leaf's own hash table yields a chain of
+ * entries which is searched for one whose hash and name both match.
+ *
+ * Returns 0 and stores the entry's value in *value on success, EIO if
+ * the header magic is wrong, ENOENT if the name is not present, or the
+ * error from dnode_read().  The zap scratch buffer is overwritten.
+ */
+static int
+fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+	fat_zap_t z;
+	uint64_t *ptrtbl;
+	uint64_t hash;
+	int rc;
+
+	if (zh.zap_magic != ZAP_MAGIC)
+		return (EIO);
+
+	z.zap_block_shift = ilog2(bsize);
+	z.zap_phys = (zap_phys_t *) zap_scratch;
+
+	/*
+	 * Figure out where the pointer table is and read it in if necessary.
+	 * NOTE(review): only one block of an external pointer table is
+	 * read, so indices beyond that block are not covered.
+	 */
+	if (zh.zap_ptrtbl.zt_blk) {
+		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
+		    zap_scratch, bsize);
+		if (rc)
+			return (rc);
+		ptrtbl = (uint64_t *) zap_scratch;
+	} else {
+		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
+	}
+
+	hash = zap_hash(zh.zap_salt, name);
+
+	zap_leaf_t zl;
+	zl.l_bs = z.zap_block_shift;
+
+	/* Must be fetched before the leaf read below reuses zap_scratch. */
+	off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
+	zap_leaf_chunk_t *zc;
+
+	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
+	if (rc)
+		return (rc);
+
+	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+	/*
+	 * Make sure this chunk matches our hash.
+	 */
+	if (zl.l_phys->l_hdr.lh_prefix_len > 0
+	    && zl.l_phys->l_hdr.lh_prefix
+	    != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
+		return (ENOENT);
+
+	/*
+	 * Hash within the chunk to find our entry.
+	 */
+	int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
+	int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
+	h = zl.l_phys->l_hash[h];
+
+	/*
+	 * Walk the chain (0xffff terminates it).  Entries whose hash
+	 * matches but whose name differs are collisions and are skipped
+	 * rather than ending the search.  The walk is capped at the
+	 * number of chunks in the leaf so a corrupt, cyclic chain cannot
+	 * hang the bootstrap.
+	 */
+	int budget = ZAP_LEAF_NUMCHUNKS(&zl);
+	while (h != 0xffff && budget-- > 0) {
+		zc = &ZAP_LEAF_CHUNK(&zl, h);
+		if (zc->l_entry.le_hash == hash &&
+		    fzap_name_equal(&zl, zc, name)) {
+			*value = fzap_leaf_value(&zl, zc);
+			return (0);
+		}
+		h = zc->l_entry.le_next;
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Lookup a name in a zap object and return its value as a uint64_t.
+ * The object's first block is read into the zap scratch buffer and the
+ * lookup is handed to the microzap or fatzap routine depending on the
+ * block's leading 64-bit type word.  Returns 0 on success or the error
+ * from dnode_read() or the chosen lookup routine.
+ */
+static int
+zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	size_t blksize = dnode->dn_datablkszsec * 512;
+	int error;
+
+	error = dnode_read(spa, dnode, 0, zap_scratch, blksize);
+	if (error != 0)
+		return (error);
+
+	if (*(uint64_t *) zap_scratch == ZBT_MICRO)
+		return (mzap_lookup(spa, dnode, name, value));
+
+	return (fzap_lookup(spa, dnode, name, value));
+}
+
+#ifdef BOOT2
+
+/*
+ * Print the names in a microzap directory, one per line.  Assumes that
+ * the zap scratch buffer already contains the directory contents.
+ * Always returns 0.
+ */
+static int
+mzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	const mzap_phys_t *mzp = (const mzap_phys_t *) zap_scratch;
+	/* Microzap objects use exactly one block. */
+	size_t blksize = dnode->dn_datablkszsec * 512;
+	int nent = blksize / MZAP_ENT_LEN - 1;
+	int idx;
+
+	for (idx = 0; idx < nent; idx++) {
+		const mzap_ent_phys_t *ent = &mzp->mz_chunk[idx];
+
+		/* Slots with an empty name are not printed. */
+		if (ent->mze_name[0] == 0)
+			continue;
+		printf("%s\n", ent->mze_name);
+	}
+
+	return (0);
+}
+
+/*
+ * List a fatzap directory. Assumes that the zap scratch buffer contains
+ * the directory header.
+ *
+ * Every leaf block is read in turn and each entry chunk in it is
+ * printed as "name 0xvalue".  Names longer than the local buffer are
+ * truncated.  Returns 0 on success or EIO if the header magic is wrong
+ * or a leaf cannot be read.  The zap scratch buffer is overwritten.
+ */
+static int
+fzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+	fat_zap_t z;
+	int i, j;
+
+	if (zh.zap_magic != ZAP_MAGIC)
+		return (EIO);
+
+	z.zap_block_shift = ilog2(bsize);
+	z.zap_phys = (zap_phys_t *) zap_scratch;
+
+	/*
+	 * This assumes that the leaf blocks start at block 1. The
+	 * documentation isn't exactly clear on this.
+	 */
+	zap_leaf_t zl;
+	zl.l_bs = z.zap_block_shift;
+	for (i = 0; i < zh.zap_num_leafs; i++) {
+		off_t off = (i + 1) << zl.l_bs;
+		char name[256], *p;
+		uint64_t value;
+
+		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
+			return (EIO);
+
+		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
+			zap_leaf_chunk_t *zc, *nc;
+			int namelen;
+
+			zc = &ZAP_LEAF_CHUNK(&zl, j);
+			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+				continue;
+
+			/* Leave room for the terminator added below. */
+			namelen = zc->l_entry.le_name_length;
+			if (namelen > (int) sizeof(name) - 1)
+				namelen = sizeof(name) - 1;
+
+			/*
+			 * Paste the name back together.
+			 */
+			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+			p = name;
+			while (namelen > 0) {
+				int len;
+				len = namelen;
+				if (len > ZAP_LEAF_ARRAY_BYTES)
+					len = ZAP_LEAF_ARRAY_BYTES;
+				memcpy(p, nc->l_array.la_array, len);
+				p += len;
+				namelen -= len;
+				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+			}
+			/* Guarantee termination even for a truncated name. */
+			*p = 0;
+
+			/*
+			 * Assume the first eight bytes of the value are
+			 * a uint64_t.
+			 */
+			value = fzap_leaf_value(&zl, zc);
+
+			printf("%-32s 0x%llx\n", name,
+			    (unsigned long long) value);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * List a zap directory.  The object's first block is read into the zap
+ * scratch buffer and the listing is handed to the microzap or fatzap
+ * routine depending on the block's leading 64-bit type word.  Returns
+ * EIO if the read fails, otherwise the result of the listing routine.
+ */
+static int
+zap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	size_t blksize = dnode->dn_datablkszsec * 512;
+
+	if (dnode_read(spa, dnode, 0, zap_scratch, blksize) != 0)
+		return (EIO);
+
+	if (*(uint64_t *) zap_scratch == ZBT_MICRO)
+		return (mzap_list(spa, dnode));
+
+	return (fzap_list(spa, dnode));
+}
+
+#endif
+
+/*
+ * Fetch the dnode for object number objnum from object set os into
+ * *dnode.  The dnode is read from the object set's meta dnode at byte
+ * offset objnum * sizeof(dnode_phys_t).  Returns whatever dnode_read()
+ * returns.
+ */
+static int
+objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
+{
+	off_t where = objnum * sizeof(dnode_phys_t);
+
+	return (dnode_read(spa, &os->os_meta_dnode, where, dnode,
+	    sizeof(dnode_phys_t)));
+}
+
+/*
+ * Find the object set given the object number of its dataset object
+ * and return its details in *objset.
+ *
+ * The dataset's dnode is fetched from the MOS; its bonus buffer is
+ * interpreted as a dsl_dataset_phys_t whose ds_bp block pointer
+ * locates the object set.  Returns 0 on success, or EIO (after
+ * printing a diagnostic) if either read fails.
+ */
+static int
+zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
+{
+	dnode_phys_t dataset;
+	dsl_dataset_phys_t *ds;
+
+	/*
+	 * objnum is cast for printf because uint64_t is not necessarily
+	 * the type that %lld expects.
+	 */
+	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
+		printf("ZFS: can't find dataset %lld\n", (long long) objnum);
+		return (EIO);
+	}
+
+	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
+	if (zio_read(spa, &ds->ds_bp, objset)) {
+		printf("ZFS: can't read object set for dataset %lld\n",
+		    (long long) objnum);
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Locate the root file system's object set and return it in *objset.
+ * The pool's "bootfs" property is used when it can be looked up;
+ * otherwise the head dataset of the root dataset directory is used.
+ * Returns 0 on success or EIO (after printing a diagnostic).
+ */
+static int
+zfs_mount_root(spa_t *spa, objset_phys_t *objset)
+{
+	dnode_phys_t dir, propdir;
+	uint64_t props, bootfs, root;
+	dsl_dir_phys_t *dd;
+
+	/* Everything hangs off the MOS directory object. */
+	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir) != 0) {
+		printf("ZFS: can't read MOS object directory\n");
+		return (EIO);
+	}
+
+	/*
+	 * Try pool_props -> bootfs first.  A failure at any step just
+	 * falls through to the root dataset below.
+	 */
+	if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0) {
+		if (objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0) {
+			if (zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0)
+				return (zfs_mount_dataset(spa, bootfs, objset));
+		}
+	}
+
+	/* No usable bootfs: find the root dataset directory instead. */
+	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root) != 0 ||
+	    objset_get_dnode(spa, &spa->spa_mos, root, &dir) != 0) {
+		printf("ZFS: can't find root dsl_dir\n");
+		return (EIO);
+	}
+
+	/*
+	 * The directory's bonus buffer names its head dataset object,
+	 * which in turn leads to the object set.
+	 */
+	dd = (dsl_dir_phys_t *) &dir.dn_bonus;
+	return (zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset));
+}
+
+/*
+ * Prepare a pool for lookups: read the MOS through the uberblock's
+ * root block pointer into spa->spa_mos, then locate the root object
+ * set into spa->spa_root_objset.  Returns 0 on success or EIO (after
+ * printing a diagnostic).
+ */
+static int
+zfs_mount_pool(spa_t *spa)
+{
+	int failed;
+
+	/* Find the MOS and work our way in from there. */
+	failed = zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos);
+	if (failed) {
+		printf("ZFS: can't read MOS\n");
+		return (EIO);
+	}
+
+	/* Find the root object set. */
+	failed = zfs_mount_root(spa, &spa->spa_root_objset);
+	if (failed) {
+		printf("Can't find root filesystem - giving up\n");
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Lookup a file and return its dnode.
+ *
+ * upath is resolved from the root directory of spa->spa_root_objset,
+ * one '/'-separated element at a time.  Symbolic links are expanded in
+ * a local buffer, with a cap on how many are followed; absolute links
+ * restart at the root, relative ones at the link's parent directory.
+ *
+ * Returns 0 and copies the final dnode to *dnode on success.  Errors:
+ * EIO for an unexpected object set type, ENOTDIR when a non-directory
+ * is used as a directory, EMLINK when too many links are followed,
+ * ENAMETOOLONG when an element or an expanded path does not fit its
+ * buffer, or whatever objset_get_dnode()/zap_lookup()/dnode_read()
+ * report.
+ */
+static int
+zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
+{
+	int rc;
+	uint64_t objnum, rootnum, parentnum;
+	dnode_phys_t dn;
+	const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
+	const char *p, *q;
+	char element[256];
+	char path[1024];
+	int symlinks_followed = 0;
+	int p_in_path = 0;	/* set once p points into path[] */
+	size_t len, linklen, taillen, i;
+
+	if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+		printf("ZFS: unexpected object set type %lld\n",
+		    spa->spa_root_objset.os_type);
+		return (EIO);
+	}
+
+	/*
+	 * Get the root directory dnode.
+	 */
+	rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
+	if (rc)
+		return (rc);
+
+	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
+	if (rc)
+		return (rc);
+
+	rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
+	if (rc)
+		return (rc);
+
+	objnum = rootnum;
+	p = upath;
+	while (p && *p) {
+		while (*p == '/')
+			p++;
+		if (!*p)
+			break;
+
+		/*
+		 * Isolate the next element, refusing anything that would
+		 * overflow element[].
+		 */
+		q = strchr(p, '/');
+		if (q) {
+			len = q - p;
+		} else {
+			for (len = 0; p[len] != 0; len++)
+				;
+		}
+		if (len >= sizeof(element))
+			return (ENAMETOOLONG);
+		memcpy(element, p, len);
+		element[len] = 0;
+		p = q;		/* 0 when that was the last element */
+
+		if ((zp->zp_mode >> 12) != 0x4) {
+			return (ENOTDIR);
+		}
+
+		parentnum = objnum;
+		rc = zap_lookup(spa, &dn, element, &objnum);
+		if (rc)
+			return (rc);
+		objnum = ZFS_DIRENT_OBJ(objnum);
+
+		rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
+		if (rc)
+			return (rc);
+
+		/*
+		 * Check for symlink.
+		 */
+		if ((zp->zp_mode >> 12) == 0xa) {
+			if (symlinks_followed > 10)
+				return (EMLINK);
+			symlinks_followed++;
+
+			/*
+			 * The new path is the link value followed by the
+			 * unconsumed tail of the current path.  Make sure
+			 * both, plus the terminator, fit in path[].
+			 */
+			if (zp->zp_size >= sizeof(path))
+				return (ENAMETOOLONG);
+			linklen = zp->zp_size;
+			taillen = 0;
+			if (p) {
+				while (p[taillen] != 0)
+					taillen++;
+			}
+			if (taillen >= sizeof(path) - linklen)
+				return (ENAMETOOLONG);
+
+			/*
+			 * Move the tail into place first.  After an earlier
+			 * symlink the tail itself lives in path[], so the
+			 * copy direction is picked to survive overlap.
+			 */
+			if (!p) {
+				path[linklen] = 0;
+			} else if (!p_in_path || &path[linklen] <= p) {
+				for (i = 0; i <= taillen; i++)
+					path[linklen + i] = p[i];
+			} else {
+				for (i = taillen + 1; i > 0; i--)
+					path[linklen + i - 1] = p[i - 1];
+			}
+
+			/*
+			 * Now read the link value in front of the tail: from
+			 * the bonus buffer if it fits there, otherwise from
+			 * the object's data.
+			 */
+			if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
+				memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
+				    zp->zp_size);
+			} else {
+				rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
+				if (rc)
+					return (rc);
+			}
+
+			/*
+			 * Restart with the new path, starting either at
+			 * the root or at the parent depending whether or
+			 * not the link is relative.
+			 */
+			p = path;
+			p_in_path = 1;
+			if (*p == '/')
+				objnum = rootnum;
+			else
+				objnum = parentnum;
+			rc = objset_get_dnode(spa, &spa->spa_root_objset,
+			    objnum, &dn);
+			if (rc)
+				return (rc);
+		}
+	}
+
+	*dnode = dn;
+	return (0);
+}
diff --git a/sys/cddl/boot/zfs/README b/sys/cddl/boot/zfs/README
new file mode 100644
index 000000000000..4b6218165ad4
--- /dev/null
+++ b/sys/cddl/boot/zfs/README
@@ -0,0 +1,14 @@
+$FreeBSD$
+
+This directory contains various files derived from CDDL sources that
+are used by the ZFS bootstrap:
+
+ fletcher.c checksum support
+ sha256.c checksum support
+ lzjb.c compression support
+ zfssubr.c mostly checksum and compression support
+ zfsimpl.h mostly describing the physical layout
+
+The files fletcher.c, lzjb.c and sha256.c are largely identical to the
+ZFS base code (with write support removed) and could be shared but
+that might complicate future imports from OpenSolaris.
diff --git a/sys/cddl/boot/zfs/fletcher.c b/sys/cddl/boot/zfs/fletcher.c
new file mode 100644
index 000000000000..2b9728d70484
--- /dev/null
+++ b/sys/cddl/boot/zfs/fletcher.c
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*#pragma ident "%Z%%M% %I% %E% SMI"*/
+
+/*
+ * Compute the fletcher-2 checksum of buf (size bytes) in native byte
+ * order and store it in *zcp.  The buffer is consumed as 64-bit words,
+ * two per iteration: even words feed the (a0, b0) pair of running
+ * sums and odd words feed (a1, b1).
+ */
+static void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint64_t *word = buf;
+	const uint64_t *const end = word + (size / sizeof (uint64_t));
+	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
+
+	while (word < end) {
+		a0 += word[0];
+		a1 += word[1];
+		b0 += a0;
+		b1 += a1;
+		word += 2;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+/*
+ * Compute the fletcher-4 checksum of buf (size bytes) in native byte
+ * order and store it in *zcp.  The buffer is consumed as 32-bit words;
+ * each word is added to the first 64-bit sum and every sum is then
+ * folded into the next one.
+ */
+static void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *const end = word + (size / sizeof (uint32_t));
+	uint64_t a = 0, b = 0, c = 0, d = 0;
+
+	while (word < end) {
+		a += *word++;
+		b += a;
+		c += b;
+		d += c;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
diff --git a/sys/cddl/boot/zfs/lzjb.c b/sys/cddl/boot/zfs/lzjb.c
new file mode 100644
index 000000000000..1283a6c31a4b
--- /dev/null
+++ b/sys/cddl/boot/zfs/lzjb.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*#pragma ident "%Z%%M% %I% %E% SMI"*/
+
+/*
+ * We keep our own copy of this algorithm for 2 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return -1 if the data will not
+ * compress to d_len or less.
+ */
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
+/*
+ * Decompress an LZJB stream from s_start into exactly d_len bytes at
+ * d_start.  s_len and n are unused.
+ *
+ * The stream is a sequence of groups: one flag byte followed by up to
+ * NBBY items.  A clear flag bit means the item is one literal byte; a
+ * set bit means it is a two-byte (length, offset) back-reference into
+ * the output produced so far.
+ *
+ * Returns 0 on success, or -1 if a back-reference points before the
+ * start of the output buffer.
+ */
+/*ARGSUSED*/
+static int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	unsigned char *in = s_start;
+	unsigned char *out = d_start;
+	unsigned char *const out_end = (unsigned char *)d_start + d_len;
+	unsigned char *from;
+	unsigned char flags = 0;
+	/* Start on the last bit so the first pass loads a flag byte. */
+	int bit = 1 << (NBBY - 1);
+
+	while (out < out_end) {
+		bit <<= 1;
+		if (bit == (1 << NBBY)) {
+			bit = 1;
+			flags = *in++;
+		}
+
+		if ((flags & bit) == 0) {
+			*out++ = *in++;
+			continue;
+		}
+
+		/* Top MATCH_BITS bits: length; remaining bits: offset. */
+		int mlen = (in[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+		int offset = ((in[0] << NBBY) | in[1]) & OFFSET_MASK;
+		in += 2;
+
+		from = out - offset;
+		if (from < (unsigned char *)d_start)
+			return (-1);
+		for (; mlen > 0 && out < out_end; mlen--)
+			*out++ = *from++;
+	}
+
+	return (0);
+}
diff --git a/sys/cddl/boot/zfs/sha256.c b/sys/cddl/boot/zfs/sha256.c
new file mode 100644
index 000000000000..f0d83acf557c
--- /dev/null
+++ b/sys/cddl/boot/zfs/sha256.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*#pragma ident "%Z%%M% %I% %E% SMI"*/
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+/*
+ * Rotate the 32-bit value x right by s bits.  s is parenthesized so
+ * that an expression argument cannot regroup with the shift or with
+ * "32 -".
+ */
+#define Rot32(x, s) (((x) >> (s)) | ((x) << (32 - (s))))
+#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+/*
+ * Per-round additive constants: SHA256Transform() adds SHA256_K[t] in
+ * round t of its 64-round loop.
+ */
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Fold one 64-byte message block at cp into the eight-word hash state
+ * H.  The block is read as sixteen big-endian 32-bit words, expanded
+ * to a 64-word schedule and run through 64 rounds; the result is added
+ * into H in place.
+ */
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+	uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+	/*
+	 * Widen each byte before shifting: a uint8_t promotes to signed
+	 * int, and shifting a value >= 0x80 left by 24 would overflow it.
+	 */
+	for (t = 0; t < 16; t++, cp += 4)
+		W[t] = ((uint32_t)cp[0] << 24) | ((uint32_t)cp[1] << 16) |
+		    ((uint32_t)cp[2] << 8) | (uint32_t)cp[3];
+
+	for (t = 16; t < 64; t++)
+		W[t] = sigma1(W[t - 2]) + W[t - 7] +
+		    sigma0(W[t - 15]) + W[t - 16];
+
+	a = H[0]; b = H[1]; c = H[2]; d = H[3];
+	e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+	for (t = 0; t < 64; t++) {
+		T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+		T2 = SIGMA0(a) + Maj(a, b, c);
+		h = g; g = f; f = e; e = d + T1;
+		d = c; c = b; b = a; a = T1 + T2;
+	}
+
+	H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+	H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+/*
+ * Compute the SHA-256 digest of buf (size bytes) and store it in *zcp
+ * as four 64-bit words, each built from two consecutive state words.
+ *
+ * Whole 64-byte blocks are hashed straight from buf; the remainder,
+ * the 0x80 marker, zero fill and the 64-bit bit count are assembled
+ * in a local buffer and hashed last.
+ *
+ * NOTE(review): the remainder bytes are copied from the START of buf,
+ * not from its tail.  This only matters when size is not a multiple
+ * of 64 and is kept as-is because changing it would change the
+ * checksum values produced.
+ */
+static void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	uint32_t state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+	    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+	uint8_t tail[128];
+	int taillen = size & 63;
+	int k;
+
+	k = 0;
+	while (k < size - taillen) {
+		SHA256Transform(state, (uint8_t *)buf + k);
+		k += 64;
+	}
+
+	for (k = 0; k < taillen; k++)
+		tail[k] = ((uint8_t *)buf)[k];
+
+	/* Marker byte, then zeros up to 8 bytes short of a block. */
+	tail[taillen++] = 0x80;
+	while ((taillen & 63) != 56)
+		tail[taillen++] = 0;
+
+	/* Message length in bits, most significant byte first. */
+	for (k = 0; k < 8; k++)
+		tail[taillen++] = (size << 3) >> (56 - 8 * k);
+
+	for (k = 0; k < taillen; k += 64)
+		SHA256Transform(state, tail + k);
+
+	ZIO_SET_CHECKSUM(zcp,
+	    (uint64_t)state[0] << 32 | state[1],
+	    (uint64_t)state[2] << 32 | state[3],
+	    (uint64_t)state[4] << 32 | state[5],
+	    (uint64_t)state[6] << 32 | state[7]);
+}
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
new file mode 100644
index 000000000000..3d178b493664
--- /dev/null
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -0,0 +1,1151 @@
+/*-
+ * Copyright (c) 2002 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and McAfee Research,, the Security Research Division of
+ * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as
+ * part of the DARPA CHATS research program
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+
+/*
+ * Macros for various sorts of alignment and rounding when the alignment
+ * is known to be a power of 2.
+ *
+ * P2ALIGN	x rounded down to a multiple of align
+ * P2PHASE	x modulo align
+ * P2NPHASE	(-x) modulo align, i.e. the distance from x up to the
+ *		next multiple of align
+ * P2ROUNDUP	x rounded up to a multiple of align
+ * P2END	the first multiple of align strictly greater than x
+ * P2PHASEUP	the smallest value >= x that equals phase modulo align
+ * P2CROSS	non-zero if x and y differ in any bit at or above align,
+ *		i.e. they lie in different align-sized blocks
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+#define P2END(x, align) (-(~(x) & -(align)))
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ *
+ * A field is the len bits of x starting at bit low.  DECODE/GET
+ * extract it, ENCODE positions a value for it, and SET replaces it in
+ * place by XORing in the difference between the old and new field
+ * values (so x is evaluated more than once and must be an lvalue
+ * without side effects).  The _SB variants additionally apply a shift
+ * and bias: stored = (value >> shift) - bias.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+/* x and low are parenthesized so expression arguments group correctly. */
+#define BF32_SET(x, low, len, val) \
+	((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+	((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+	((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * E endianness
+ * type DMU object type
+ * lvl level of indirection
+ * birth txg transaction group in which the block was born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+ dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[3]; /* Extra space for the future */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_HOLE(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+/*
+ * Yields the physical size for indirect blocks (level > 0) and for
+ * object types flagged as metadata in dmu_ot[], and the logical size
+ * otherwise.  No trailing semicolon, so it can be used inside an
+ * expression.
+ */
+#define BP_GET_UCSIZE(bp) \
+	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_pad[2] = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+
+typedef struct zio_block_tail {
+ uint64_t zbt_magic; /* for validation, endianness */
+ zio_cksum_t zbt_cksum; /* 256-bit checksum */
+} zio_block_tail_t;
+
+#define VDEV_SKIP_SIZE (8 << 10)
+#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
+
+/* ZFS boot block */
+#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
+#define VDEV_BOOT_VERSION 1 /* version number */
+
+typedef struct vdev_boot_header {
+ uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
+ uint64_t vb_version; /* VDEV_BOOT_VERSION */
+ uint64_t vb_offset; /* start offset (bytes) */
+ uint64_t vb_size; /* size (bytes) */
+ char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+ zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
+ vdev_boot_header_t vl_boot_header; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+/* nvlist pack encoding */
+#define NV_ENCODE_NATIVE 0
+#define NV_ENCODE_XDR 1
+
+typedef enum {
+ DATA_TYPE_UNKNOWN = 0,
+ DATA_TYPE_BOOLEAN,
+ DATA_TYPE_BYTE,
+ DATA_TYPE_INT16,
+ DATA_TYPE_UINT16,
+ DATA_TYPE_INT32,
+ DATA_TYPE_UINT32,
+ DATA_TYPE_INT64,
+ DATA_TYPE_UINT64,
+ DATA_TYPE_STRING,
+ DATA_TYPE_BYTE_ARRAY,
+ DATA_TYPE_INT16_ARRAY,
+ DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_INT32_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
+ DATA_TYPE_INT64_ARRAY,
+ DATA_TYPE_UINT64_ARRAY,
+ DATA_TYPE_STRING_ARRAY,
+ DATA_TYPE_HRTIME,
+ DATA_TYPE_NVLIST,
+ DATA_TYPE_NVLIST_ARRAY,
+ DATA_TYPE_BOOLEAN_VALUE,
+ DATA_TYPE_INT8,
+ DATA_TYPE_UINT8,
+ DATA_TYPE_BOOLEAN_ARRAY,
+ DATA_TYPE_INT8_ARRAY,
+ DATA_TYPE_UINT8_ARRAY
+} data_type_t;
+
+/*
+ * On-disk version number.
+ */
+#define ZFS_VERSION_1 1ULL
+#define ZFS_VERSION_2 2ULL
+#define ZFS_VERSION_3 3ULL
+#define ZFS_VERSION_4 4ULL
+#define ZFS_VERSION_5 5ULL
+#define ZFS_VERSION_6 6ULL
+/*
+ * When bumping up ZFS_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
+ * and do the appropriate changes.
+ */
+#define ZFS_VERSION ZFS_VERSION_6
+#define ZFS_VERSION_STRING "6"
+
+/*
+ * Symbolic names for the changes that caused a ZFS_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current ZFS_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define ZFS_VERSION_INITIAL ZFS_VERSION_1
+#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2
+#define ZFS_VERSION_SPARES ZFS_VERSION_3
+#define ZFS_VERSION_RAID6 ZFS_VERSION_3
+#define ZFS_VERSION_BPLIST_ACCOUNT ZFS_VERSION_3
+#define ZFS_VERSION_RAIDZ_DEFLATE ZFS_VERSION_3
+#define ZFS_VERSION_DNODE_BYTES ZFS_VERSION_3
+#define ZFS_VERSION_ZPOOL_HISTORY ZFS_VERSION_4
+#define ZFS_VERSION_GZIP_COMPRESSION ZFS_VERSION_5
+#define ZFS_VERSION_BOOTFS ZFS_VERSION_6
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration.
+ */
+#define ZPOOL_CONFIG_VERSION "version"
+#define ZPOOL_CONFIG_POOL_NAME "name"
+#define ZPOOL_CONFIG_POOL_STATE "state"
+#define ZPOOL_CONFIG_POOL_TXG "txg"
+#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
+#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
+#define ZPOOL_CONFIG_TOP_GUID "top_guid"
+#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
+#define ZPOOL_CONFIG_TYPE "type"
+#define ZPOOL_CONFIG_CHILDREN "children"
+#define ZPOOL_CONFIG_ID "id"
+#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_PATH "path"
+#define ZPOOL_CONFIG_DEVID "devid"
+#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
+#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
+#define ZPOOL_CONFIG_ASHIFT "ashift"
+#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_DTL "DTL"
+#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
+#define ZPOOL_CONFIG_SPARES "spares"
+#define ZPOOL_CONFIG_IS_SPARE "is_spare"
+#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_HOSTID "hostid"
+#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
+
+#define VDEV_TYPE_ROOT "root"
+#define VDEV_TYPE_MIRROR "mirror"
+#define VDEV_TYPE_REPLACING "replacing"
+#define VDEV_TYPE_RAIDZ "raidz"
+#define VDEV_TYPE_DISK "disk"
+#define VDEV_TYPE_FILE "file"
+#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_SPARE "spare"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ */
+#define SPA_MINDEVSIZE (64ULL << 20)
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define ZPOOL_CACHE_DIR "/boot/zfs"
+#define ZPOOL_CACHE_FILE "zpool.cache"
+#define ZPOOL_CACHE_TMP ".zpool.cache"
+
+#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
+ VDEV_STATE_CLOSED, /* Not currently open */
+ VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
+ VDEV_STATE_HEALTHY /* Presumed good */
+} vdev_state_t;
+
+/*
+ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+ VDEV_AUX_NONE, /* no error */
+ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
+ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
+ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
+ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
+ VDEV_AUX_TOO_SMALL, /* vdev size is too small */
+ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
+ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
+ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
+ VDEV_AUX_SPARED /* hot spare used in another pool */
+} vdev_aux_t;
+
+/*
+ * pool state. The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
+ * software abstractions used at various levels to communicate pool state.
+ */
+typedef enum pool_state {
+ POOL_STATE_ACTIVE = 0, /* In active use */
+ POOL_STATE_EXPORTED, /* Explicitly exported */
+ POOL_STATE_DESTROYED, /* Explicitly destroyed */
+ POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
+ POOL_STATE_UNAVAIL, /* Internal libzfs state */
+ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* ZFS_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+};
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << ((blkshift) - (typeshift)))
+
+/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPLIST, /* UINT64 */
+ DMU_OT_BPLIST_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_pad[5];
+} zil_header_t;
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+ sizeof (uint64_t)];
+} objset_phys_t;
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags;
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_block_shift)
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_pad[6];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct fat_zap {
+ int zap_block_shift; /* block size shift */
+ zap_phys_t *zap_phys;
+} fat_zap_t;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_int_size; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_length; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ int l_bs; /* block size shift */
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZPL_VERSION 1ULL
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)(type) << 60) | (obj))
+
+typedef struct ace {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+} ace_t;
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * In-core vdev representation.
+ */
+struct vdev;
+typedef int vdev_read_t(struct vdev *vdev, void *priv, off_t offset, void *buf, size_t bytes);
+
+typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t;
+
+typedef struct vdev {
+ STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */
+ STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */
+ vdev_list_t v_children; /* children of this vdev */
+ char *v_name; /* vdev name */
+ uint64_t v_guid; /* vdev guid */
+ int v_id; /* index in parent */
+ vdev_state_t v_state; /* current state */
+ vdev_read_t *v_read; /* function to read from this vdev */
+ void *v_read_priv; /* private data for read function */
+} vdev_t;
+
+/*
+ * In-core pool representation.
+ */
+typedef STAILQ_HEAD(spa_list, spa) spa_list_t;
+
+typedef struct spa {
+ STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
+ char *spa_name; /* pool name */
+ uint64_t spa_guid; /* pool guid */
+ uint64_t spa_txg; /* most recent transaction */
+ struct uberblock spa_uberblock; /* best uberblock so far */
+ vdev_list_t spa_vdevs; /* list of all toplevel vdevs */
+ objset_phys_t spa_mos; /* MOS for this pool */
+ objset_phys_t spa_root_objset; /* current mounted ZPL objset */
+} spa_t;
diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c
new file mode 100644
index 000000000000..1c859c0f995b
--- /dev/null
+++ b/sys/cddl/boot/zfs/zfssubr.c
@@ -0,0 +1,193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+static uint64_t zfs_crc64_table[256];
+
+/*
+ * Populate zfs_crc64_table, the lookup table consumed by zap_hash().
+ *
+ * Entry 128 equals ZFS_CRC64_POLY once the table has been generated, so
+ * that entry doubles as the "already initialized" marker and the function
+ * may be called more than once.
+ */
+static void
+zfs_init_crc(void)
+{
+	uint64_t *entry;
+	int idx, bit;
+
+	/* Nothing to do if the table has been built already. */
+	if (zfs_crc64_table[128] == ZFS_CRC64_POLY)
+		return;
+
+	memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
+	for (idx = 0; idx < 256; idx++) {
+		entry = &zfs_crc64_table[idx];
+		*entry = idx;
+		/* Shift out one bit per round, folding in the polynomial. */
+		for (bit = 0; bit < 8; bit++) {
+			if (*entry & 1)
+				*entry = (*entry >> 1) ^ ZFS_CRC64_POLY;
+			else
+				*entry >>= 1;
+		}
+	}
+}
+
+/*
+ * Checksum function behind the "off" entry of zio_checksum_table: the
+ * buffer and its size are ignored and all four checksum words are set
+ * to zero.
+ */
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*
+ * Signature for checksum functions.
+ *
+ * A checksum function is called with the 'size' bytes at 'data' and
+ * stores its result in *zcp.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ *
+ * zio_checksum_error() in this file only ever calls ci_func[0]; a NULL
+ * ci_func[0] marks an entry that cannot be verified.
+ */
+typedef struct zio_checksum_info {
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_zbt; /* uses zio block tail? */
+ const char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+#include "fletcher.c"
+#include "sha256.c"
+
+/*
+ * Checksum vectors, indexed by the checksum type taken from a block
+ * pointer.  Each row is {ci_func[2], ci_correctable, ci_zbt, ci_name}.
+ * "inherit" and "on" have no function and are rejected by
+ * zio_checksum_error().  Only the first function slot is filled in for
+ * the real algorithms.
+ * NOTE(review): the row order presumably has to match the checksum
+ * enumeration behind BP_GET_CHECKSUM() -- confirm against the header.
+ */
+static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
+ {{zio_checksum_SHA256, NULL}, 1, 1, "label"},
+ {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
+ {{fletcher_2_native, NULL}, 0, 1, "zilog"},
+ {{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
+ {{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
+ {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
+};
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ *
+ * The trailing unnamed int receives the table entry's ci_level (see the
+ * call in zio_decompress_data()).
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+
+/*
+ * Information about each compression function.
+ *
+ * A NULL ci_decompress means the algorithm cannot be decompressed by
+ * this code.
+ */
+typedef struct zio_compress_info {
+ zio_compress_func_t *ci_compress; /* compression function */
+ zio_decompress_func_t *ci_decompress; /* decompression function */
+ int ci_level; /* level parameter */
+ const char *ci_name; /* algorithm name */
+} zio_compress_info_t;
+
+#include "lzjb.c"
+
+/*
+ * Compression vectors.
+ *
+ * No compression routines are provided, and lzjb is the only algorithm
+ * with a decompression routine.  Every other entry has a NULL
+ * ci_decompress and is reported as unsupported by zio_decompress_data().
+ * For the gzip-N rows ci_level is N.
+ */
+static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {NULL, NULL, 0, "inherit"},
+ {NULL, NULL, 0, "on"},
+ {NULL, NULL, 0, "uncompressed"},
+ {NULL, lzjb_decompress, 0, "lzjb"},
+ {NULL, NULL, 0, "empty"},
+ {NULL, NULL, 1, "gzip-1"},
+ {NULL, NULL, 2, "gzip-2"},
+ {NULL, NULL, 3, "gzip-3"},
+ {NULL, NULL, 4, "gzip-4"},
+ {NULL, NULL, 5, "gzip-5"},
+ {NULL, NULL, 6, "gzip-6"},
+ {NULL, NULL, 7, "gzip-7"},
+ {NULL, NULL, 8, "gzip-8"},
+ {NULL, NULL, 9, "gzip-9"},
+};
+
+/*
+ * Verify the checksum of the block described by 'bp'.
+ *
+ * 'data' must hold the BP_GET_PSIZE(bp) bytes read for the block.  For
+ * checksum types that keep their checksum in a zio_block_tail_t at the end
+ * of the buffer (ci_zbt), the embedded checksum is temporarily replaced by
+ * the block pointer's checksum while the buffer is summed and is restored
+ * afterwards; the computed value is then compared with the embedded one.
+ *
+ * Returns 0 when the checksum matches, EINVAL when the checksum type is
+ * out of range or has no function in zio_checksum_table, and EIO on a
+ * mismatch.
+ */
+static int
+zio_checksum_error(const blkptr_t *bp, void *data)
+{
+	zio_cksum_t zc = bp->blk_cksum;
+	unsigned int checksum = BP_GET_CHECKSUM(bp);
+	uint64_t size = BP_GET_PSIZE(bp);
+	zio_block_tail_t *zbt;
+	zio_checksum_info_t *ci;
+	zio_cksum_t actual_cksum, expected_cksum;
+
+	/*
+	 * The checksum type comes straight from the block pointer, so it is
+	 * validated before it is used to index the table.
+	 */
+	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
+		return (EINVAL);
+	ci = &zio_checksum_table[checksum];
+	if (ci->ci_func[0] == NULL)
+		return (EINVAL);
+
+	if (ci->ci_zbt) {
+		/* The block tail occupies the last bytes of the buffer. */
+		zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+		expected_cksum = zbt->zbt_cksum;
+		zbt->zbt_cksum = zc;
+		ci->ci_func[0](data, size, &actual_cksum);
+		zbt->zbt_cksum = expected_cksum;
+		zc = expected_cksum;
+	} else {
+		/* ASSERT(!BP_IS_GANG(bp)); */
+		ci->ci_func[0](data, size, &actual_cksum);
+	}
+
+	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
+		/*printf("ZFS: read checksum failed\n");*/
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Decompress 'srcsize' bytes at 'src' into the 'destsize'-byte buffer at
+ * 'dest' using compression function number 'cpfunc'.
+ *
+ * Returns the decompression routine's result, or EIO (after printing a
+ * diagnostic) when 'cpfunc' is out of range or names an algorithm that has
+ * no decompression routine in zio_compress_table.
+ */
+static int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize)
+{
+	zio_compress_info_t *ci;
+
+	/*
+	 * The range ASSERT is not compiled in here, so reject out-of-range
+	 * function numbers explicitly before touching the table.
+	 */
+	if (cpfunc < 0 || cpfunc >= ZIO_COMPRESS_FUNCTIONS ||
+	    zio_compress_table[cpfunc].ci_decompress == NULL) {
+		printf("ZFS: unsupported compression algorithm %d\n", cpfunc);
+		return (EIO);
+	}
+	ci = &zio_compress_table[cpfunc];
+
+	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+}
+
+/*
+ * Compute the ZAP hash of the NUL-terminated string 'name', seeded with
+ * 'salt'.  zfs_crc64_table must already have been filled in by
+ * zfs_init_crc().
+ */
+static uint64_t
+zap_hash(uint64_t salt, const char *name)
+{
+	const uint8_t *p = (const uint8_t *)name;
+	uint64_t hash = salt;
+
+	/*ASSERT(hash != 0);*/
+	/*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
+	while (*p != '\0') {
+		hash = (hash >> 8) ^ zfs_crc64_table[(hash ^ *p) & 0xFF];
+		p++;
+	}
+
+	/*
+	 * Only use 28 bits, since we need 4 bits in the cookie for the
+	 * collision differentiator.  We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	hash &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+	return (hash);
+}
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
index 37de21f4f548..bdb8f02ac20c 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
@@ -61,6 +61,15 @@ atomic_add_64(volatile uint64_t *target, int64_t delta)
*target += delta;
mtx_unlock(&atomic_mtx);
}
+
+/*
+ * Decrement *target by one while holding atomic_mtx, the same mutex
+ * atomic_add_64() above takes around its update.
+ */
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target -= 1;
+ mtx_unlock(&atomic_mtx);
+}
#endif
uint64_t
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
index 139d018d27a7..a24ca83e4556 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -94,7 +94,7 @@ zfs_kmem_free(void *buf, size_t size __unused)
{
#ifdef KMEM_DEBUG
if (buf == NULL) {
- printf("%s: attempt to free NULL\n",__func__);
+ printf("%s: attempt to free NULL\n", __func__);
return;
}
struct kmem_item *i;
@@ -156,7 +156,7 @@ kmem_cache_create(char *name, size_t bufsize, size_t align,
cache->kc_constructor = constructor;
cache->kc_destructor = destructor;
cache->kc_private = private;
-#ifdef _KERNEL
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
constructor != NULL ? kmem_std_constructor : NULL,
destructor != NULL ? kmem_std_destructor : NULL,
@@ -171,23 +171,23 @@ kmem_cache_create(char *name, size_t bufsize, size_t align,
void
kmem_cache_destroy(kmem_cache_t *cache)
{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
uma_zdestroy(cache->kc_zone);
+#endif
kmem_free(cache, sizeof(*cache));
}
void *
kmem_cache_alloc(kmem_cache_t *cache, int flags)
{
-#ifdef _KERNEL
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
return (uma_zalloc_arg(cache->kc_zone, cache, flags));
#else
void *p;
p = kmem_alloc(cache->kc_size, flags);
- if (p != NULL) {
- kmem_std_constructor(p, cache->kc_size, cache->kc_private,
- flags);
- }
+ if (p != NULL && cache->kc_constructor != NULL)
+ kmem_std_constructor(p, cache->kc_size, cache, flags);
return (p);
#endif
}
@@ -195,10 +195,11 @@ kmem_cache_alloc(kmem_cache_t *cache, int flags)
void
kmem_cache_free(kmem_cache_t *cache, void *buf)
{
-#ifdef _KERNEL
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
uma_zfree_arg(cache->kc_zone, buf, cache);
#else
- kmem_std_destructor(buf, cache->kc_size, cache->kc_private);
+ if (cache->kc_destructor != NULL)
+ kmem_std_destructor(buf, cache->kc_size, cache);
kmem_free(buf, cache->kc_size);
#endif
}
@@ -207,7 +208,9 @@ kmem_cache_free(kmem_cache_t *cache, void *buf)
void
kmem_cache_reap_now(kmem_cache_t *cache)
{
+#ifndef KMEM_DEBUG
zone_drain(cache->kc_zone);
+#endif
}
void
@@ -253,6 +256,8 @@ kmem_show(void *dummy __unused)
printf("KMEM_DEBUG: Leaked elements:\n\n");
LIST_FOREACH(i, &kmem_items, next) {
printf("address=%p\n", i);
+ stack_print_ddb(&i->stack);
+ printf("\n");
}
}
mtx_unlock(&kmem_items_mtx);
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
new file mode 100644
index 000000000000..47df799b20b2
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
@@ -0,0 +1,112 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+
+/*
+ * Convenience wrapper: forwards all arguments to lookupnameat() with a
+ * NULL start vnode and returns its result.
+ */
+int
+lookupname(char *dirname, enum uio_seg seg, enum symfollow follow,
+ vnode_t **dirvpp, vnode_t **compvpp)
+{
+
+ return (lookupnameat(dirname, seg, follow, dirvpp, compvpp, NULL));
+}
+
+/*
+ * Look up 'dirname' with namei() and store the resulting vnode (locked,
+ * since LOCKLEAF is requested) in *compvpp.
+ *
+ * When 'startvp' is not NULL the lookup starts at that vnode: an extra
+ * reference is taken, the vnode is unlocked for the duration of namei()
+ * and relocked with its previous lock type before returning.  When
+ * 'startvp' is NULL, as passed by lookupname(), none of that is done and
+ * namei() is given no start vnode.
+ * NOTE(review): a NULL start vnode is expected to make namei() resolve
+ * the path the ordinary way -- confirm against namei(9).
+ *
+ * 'dirvpp' must be NULL; returning the parent directory is not supported.
+ * Returns 0 or the error from namei().
+ */
+int
+lookupnameat(char *dirname, enum uio_seg seg, enum symfollow follow,
+    vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
+{
+	struct nameidata nd;
+	int error, ltype;
+
+	ASSERT(dirvpp == NULL);
+
+	ltype = 0;
+	if (startvp != NULL) {
+		vref(startvp);
+		ltype = VOP_ISLOCKED(startvp);
+		VOP_UNLOCK(startvp, 0);
+	}
+	NDINIT_ATVP(&nd, LOOKUP, LOCKLEAF | MPSAFE | follow, seg, dirname,
+	    startvp, curthread);
+	error = namei(&nd);
+	*compvpp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	/* Restore the caller's lock on the start vnode, if there was one. */
+	if (startvp != NULL)
+		vn_lock(startvp, ltype | LK_RETRY);
+	return (error);
+}
+
+/*
+ * Follow the chain of file systems mounted on *cvpp.
+ *
+ * While the current vnode has a file system mounted on it
+ * (vn_mountedvfs() returns non-NULL), the vnode is released and replaced
+ * by the root vnode of that file system, obtained from VFS_ROOT() with
+ * lock type 'lktype'.  On success *cvpp is set to the last vnode reached
+ * and 0 is returned.  If VFS_ROOT() fails, its error is returned and
+ * *cvpp is not updated.
+ * NOTE(review): on that error path the vnode the caller passed in has
+ * already been vrele()'d here -- callers presumably must not use it
+ * afterwards; confirm.
+ */
+int
+traverse(vnode_t **cvpp, int lktype)
+{
+ kthread_t *td = curthread;
+ vnode_t *cvp;
+ vnode_t *tvp;
+ vfs_t *vfsp;
+ int error;
+
+ cvp = *cvpp;
+ tvp = NULL;
+
+ /*
+ * If this vnode is mounted on, then we transparently indirect
+ * to the vnode which is the root of the mounted file system.
+ * Before we do this we must check that an unmount is not in
+ * progress on this vnode.
+ */
+
+ for (;;) {
+ /*
+ * Reached the end of the mount chain?
+ */
+ vfsp = vn_mountedvfs(cvp);
+ if (vfsp == NULL)
+ break;
+ /*
+ * tvp is NULL for *cvpp vnode, which we can't unlock.
+ */
+ if (tvp != NULL)
+ vput(cvp);
+ else
+ vrele(cvp);
+
+ /*
+ * The read lock must be held across the call to VFS_ROOT() to
+ * prevent a concurrent unmount from destroying the vfs.
+ */
+ error = VFS_ROOT(vfsp, lktype, &tvp, td);
+ if (error != 0)
+ return (error);
+ cvp = tvp;
+ }
+
+ *cvpp = cvp;
+ return (0);
+}
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
index a89d478d70e5..279ae4c042bb 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
@@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
+#include <sys/limits.h>
#include <sys/misc.h>
#include <sys/sunddi.h>
@@ -40,17 +41,30 @@ struct opensolaris_utsname utsname = {
};
+/*
+ * Convert the string 'str' to a long in the given 'base' and store it in
+ * *result.  If 'nptr' is not NULL, *nptr is set to the first character
+ * that was not converted.
+ *
+ * Returns 0 on success (including when the parsed value is 0), EINVAL
+ * when no characters could be converted, and ERANGE when the result is
+ * LONG_MIN or LONG_MAX.  Those two values are treated as out of range
+ * because no other overflow indication is consulted here.
+ */
 int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+	char *end;
+
+	*result = strtol(str, &end, base);
+	if (nptr != NULL)
+		*nptr = end;
+	/* Nothing consumed means nothing was converted. */
+	if (end == str)
+		return (EINVAL);
+	if (*result == LONG_MIN || *result == LONG_MAX)
+		return (ERANGE);
+	return (0);
+}
+
+int
ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
{
- char *end;
if (str == hw_serial) {
*result = hostid;
return (0);
}
- *result = strtoul(str, &end, base);
+ *result = strtoul(str, nptr, base);
if (*result == 0)
return (EINVAL);
+ else if (*result == ULONG_MAX)
+ return (ERANGE);
return (0);
}
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
index 272fe59e21a7..837d736d15af 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
@@ -30,9 +30,20 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/priv.h>
#include <sys/vnode.h>
+#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/stat.h>
+#include <sys/jail.h>
#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+
+/*
+ * Privilege check that currently demands PRIV_ROOT; the TODO below marks
+ * this as a placeholder for a narrower privilege.  Returns the result of
+ * priv_check_cred().
+ */
+int
+secpolicy_nfs(struct ucred *cred)
+{
+
+ /* TODO: Change PRIV_ROOT! */
+ return (priv_check_cred(cred, PRIV_ROOT, 0));
+}
int
secpolicy_zfs(struct ucred *cred)
@@ -62,16 +73,32 @@ secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp __unused)
return (priv_check_cred(cred, PRIV_VFS_UNMOUNT, 0));
}
+/*
+ * Decide whether 'cred' is to be treated as the owner of mount 'mp'.
+ *
+ * With zfs_super_owner enabled, a credential whose uid equals the uid of
+ * the mount's credential is accepted, provided it is either not jailed or
+ * belongs to the same prison as the mount's credential.  In every other
+ * case the decision falls back to the PRIV_VFS_MOUNT_OWNER privilege
+ * check.  Returns 0 when accepted, otherwise the priv_check_cred() error.
+ */
+int
+secpolicy_fs_owner(struct mount *mp, struct ucred *cred)
+{
+	struct ucred *mcred;
+
+	if (!zfs_super_owner)
+		goto privcheck;
+	mcred = mp->mnt_cred;
+	if (cred->cr_uid != mcred->cr_uid)
+		goto privcheck;
+	if (jailed(cred) && cred->cr_prison != mcred->cr_prison)
+		goto privcheck;
+	return (0);
+
+privcheck:
+	return (priv_check_cred(cred, PRIV_VFS_MOUNT_OWNER, 0));
+}
+
/*
* This check is done in kern_link(), so we could just return 0 here.
*/
extern int hardlink_check_uid;
int
-secpolicy_basic_link(struct ucred *cred)
+secpolicy_basic_link(struct vnode *vp, struct ucred *cred)
{
if (!hardlink_check_uid)
return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
return (priv_check_cred(cred, PRIV_VFS_LINK, 0));
}
@@ -83,9 +110,11 @@ secpolicy_vnode_stky_modify(struct ucred *cred)
}
int
-secpolicy_vnode_remove(struct ucred *cred)
+secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred)
{
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0));
}
@@ -94,9 +123,11 @@ secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner,
accmode_t accmode)
{
- if ((accmode & VREAD) && priv_check_cred(cred, PRIV_VFS_READ, 0) != 0) {
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
+
+ if ((accmode & VREAD) && priv_check_cred(cred, PRIV_VFS_READ, 0) != 0)
return (EACCES);
- }
if ((accmode & VWRITE) &&
priv_check_cred(cred, PRIV_VFS_WRITE, 0) != 0) {
return (EACCES);
@@ -116,11 +147,13 @@ secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner,
}
int
-secpolicy_vnode_setdac(struct ucred *cred, uid_t owner)
+secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred, uid_t owner)
{
if (owner == cred->cr_uid)
return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0));
}
@@ -148,7 +181,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
* In the specific case of creating a set-uid root
* file, we need even more permissions.
*/
- error = secpolicy_vnode_setdac(cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
if (error)
return (error);
error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cred);
@@ -158,7 +191,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
vap->va_mode = ovap->va_mode;
}
if (mask & (AT_UID | AT_GID)) {
- error = secpolicy_vnode_setdac(cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
if (error)
return (error);
@@ -170,14 +203,16 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
!groupmember(vap->va_gid, cred))) {
- error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
- if (error)
- return (error);
+ if (secpolicy_fs_owner(vp->v_mount, cred) != 0) {
+ error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
+ if (error)
+ return (error);
+ }
}
if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
- secpolicy_setid_clear(vap, cred);
+ secpolicy_setid_clear(vap, vp, cred);
}
}
if (mask & (AT_ATIME | AT_MTIME)) {
@@ -189,7 +224,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
* If times is non-NULL, ... The caller must be the owner of
* the file or be the super-user.
*/
- error = secpolicy_vnode_setdac(cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
if (error && (vap->va_vaflags & VA_UTIMES_NULL))
error = unlocked_access(node, VWRITE, cred);
if (error)
@@ -206,25 +241,33 @@ secpolicy_vnode_create_gid(struct ucred *cred)
}
int
-secpolicy_vnode_setids_setgids(struct ucred *cred, gid_t gid)
+secpolicy_vnode_setids_setgids(struct vnode *vp, struct ucred *cred, gid_t gid)
{
- if (!groupmember(gid, cred))
- return (priv_check_cred(cred, PRIV_VFS_SETGID, 0));
- return (0);
+ if (groupmember(gid, cred))
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
+ return (priv_check_cred(cred, PRIV_VFS_SETGID, 0));
}
int
-secpolicy_vnode_setid_retain(struct ucred *cred, boolean_t issuidroot __unused)
+secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred,
+ boolean_t issuidroot __unused)
{
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
return (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0));
}
void
-secpolicy_setid_clear(struct vattr *vap, struct ucred *cred)
+secpolicy_setid_clear(struct vattr *vap, struct vnode *vp, struct ucred *cred)
{
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return;
+
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
vap->va_mask |= AT_MODE;
@@ -239,6 +282,9 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
{
int error;
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
+
/*
* Privileged processes may set the sticky bit on non-directories,
* as well as set the setgid bit on a file with a group that the process
@@ -253,9 +299,61 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
* group-id bit.
*/
if ((vap->va_mode & S_ISGID) != 0) {
- error = secpolicy_vnode_setids_setgids(cred, ovap->va_gid);
+ error = secpolicy_vnode_setids_setgids(vp, cred, ovap->va_gid);
if (error)
return (error);
}
return (0);
}
+
+/*
+ * Mount permission check: requires PRIV_VFS_MOUNT.  'mvp' and 'vfsp' are
+ * not examined.
+ */
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+ return (priv_check_cred(cr, PRIV_VFS_MOUNT, 0));
+}
+
+/*
+ * Permission check for operations reserved for a file's owner.  Returns
+ * 0 when 'owner' is the caller's uid or when secpolicy_fs_owner() accepts
+ * the caller for the vnode's mount; otherwise returns the result of the
+ * PRIV_VFS_MOUNT_OWNER privilege check.
+ */
+int
+secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner)
+{
+
+ if (owner == cred->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
+
+ /* XXX: vfs_suser()? */
+ return (priv_check_cred(cred, PRIV_VFS_MOUNT_OWNER, 0));
+}
+
+/*
+ * Permission check for changing ownership: granted when
+ * secpolicy_fs_owner() accepts the caller for the vnode's mount, otherwise
+ * PRIV_VFS_CHOWN is required.  'check_self' is not used.
+ */
+int
+secpolicy_vnode_chown(struct vnode *vp, cred_t *cred, boolean_t check_self)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ return (0);
+ return (priv_check_cred(cred, PRIV_VFS_CHOWN, 0));
+}
+
+/*
+ * For callers lacking PRIV_VFS_MOUNT_NONUSER, force the mount into a
+ * no-setuid user mount: VFS_NOSETUID and MNT_USER are set in vfs_flag,
+ * the MNTOPT_SETUID option is cleared and MNTOPT_NOSETUID is set, all
+ * while holding the mount interlock.  Callers holding the privilege
+ * leave the mount untouched.
+ */
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+ if (priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER, 0) != 0) {
+ MNT_ILOCK(vfsp);
+ vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+ vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+ vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+ MNT_IUNLOCK(vfsp);
+ }
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ *
+ * Only PRIV_VFS_SYSFLAGS is checked; 'xvap', 'owner' and 'vtype' are not
+ * examined.
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype)
+{
+
+ return (priv_check_cred(cr, PRIV_VFS_SYSFLAGS, 0));
+}
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
index e9120ee0a8f1..f1bb4e25b85a 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
+#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/cred.h>
#include <sys/vfs.h>
@@ -110,60 +111,12 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
}
int
-traverse(vnode_t **cvpp, int lktype)
-{
- kthread_t *td = curthread;
- vnode_t *cvp;
- vnode_t *tvp;
- vfs_t *vfsp;
- int error;
-
- cvp = *cvpp;
- tvp = NULL;
-
- /*
- * If this vnode is mounted on, then we transparently indirect
- * to the vnode which is the root of the mounted file system.
- * Before we do this we must check that an unmount is not in
- * progress on this vnode.
- */
-
- for (;;) {
- /*
- * Reached the end of the mount chain?
- */
- vfsp = vn_mountedvfs(cvp);
- if (vfsp == NULL)
- break;
- /*
- * tvp is NULL for *cvpp vnode, which we can't unlock.
- */
- if (tvp != NULL)
- vput(cvp);
- else
- vrele(cvp);
-
- /*
- * The read lock must be held across the call to VFS_ROOT() to
- * prevent a concurrent unmount from destroying the vfs.
- */
- error = VFS_ROOT(vfsp, lktype, &tvp, td);
- if (error != 0)
- return (error);
- cvp = tvp;
- }
-
- *cvpp = cvp;
- return (0);
-}
-
-int
domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
char *fspec, int fsflags)
{
struct mount *mp;
struct vfsconf *vfsp;
- struct ucred *newcr, *oldcr;
+ struct ucred *cr;
int error;
/*
@@ -203,29 +156,31 @@ domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
/*
* Set the mount level flags.
- * crdup() can sleep, so do it before acquiring a mutex.
*/
- newcr = crdup(kcred);
- MNT_ILOCK(mp);
if (fsflags & MNT_RDONLY)
mp->mnt_flag |= MNT_RDONLY;
mp->mnt_flag &=~ MNT_UPDATEMASK;
mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
/*
* Unprivileged user can trigger mounting a snapshot, but we don't want
- * him to unmount it, so we switch to privileged credentials.
+ * him to unmount it, so we switch to the credentials of the original mount.
*/
- oldcr = mp->mnt_cred;
- mp->mnt_cred = newcr;
+ crfree(mp->mnt_cred);
+ mp->mnt_cred = crdup(vp->v_mount->mnt_cred);
mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid;
- MNT_IUNLOCK(mp);
- crfree(oldcr);
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
+ /*
+ * XXX: This is evil, but we can't mount a snapshot as a regular user.
+ * XXX: Is it safe when snapshot is mounted from within a jail?
+ */
+ cr = td->td_ucred;
+ td->td_ucred = kcred;
error = VFS_MOUNT(mp, td);
+ td->td_ucred = cr;
if (!error) {
if (mp->mnt_opt != NULL)
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
index 3059a787d98c..8489052373d0 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/jail.h>
+#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/zone.h>
@@ -52,7 +53,7 @@ typedef struct zone_dataset {
LIST_HEAD(zone_dataset_head, zone_dataset);
-static struct prison_service *zone_prison_service = NULL;
+static int zone_slot;
int
zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
@@ -60,7 +61,7 @@ zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
struct zone_dataset_head *head;
zone_dataset_t *zd, *zd2;
struct prison *pr;
- int error;
+ int dofree, error;
if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL, 0)) != 0)
return (error);
@@ -76,18 +77,33 @@ zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
return (ENOENT);
}
- head = prison_service_data_get(zone_prison_service, pr);
- LIST_FOREACH(zd2, head, zd_next) {
- if (strcmp(dataset, zd2->zd_dataset) == 0) {
- free(zd, M_ZONES);
- error = EEXIST;
- goto failure;
+ head = osd_jail_get(pr, zone_slot);
+ if (head != NULL) {
+ dofree = 0;
+ LIST_FOREACH(zd2, head, zd_next) {
+ if (strcmp(dataset, zd2->zd_dataset) == 0) {
+ free(zd, M_ZONES);
+ error = EEXIST;
+ goto end;
+ }
}
+ } else {
+ dofree = 1;
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+ head = malloc(sizeof(*head), M_ZONES, M_WAITOK);
+ LIST_INIT(head);
+ mtx_lock(&pr->pr_mtx);
+ error = osd_jail_set(pr, zone_slot, head);
+ KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error));
}
strcpy(zd->zd_dataset, dataset);
LIST_INSERT_HEAD(head, zd, zd_next);
-failure:
- mtx_unlock(&pr->pr_mtx);
+end:
+ if (dofree)
+ prison_free_locked(pr);
+ else
+ mtx_unlock(&pr->pr_mtx);
return (error);
}
@@ -107,16 +123,25 @@ zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENOENT);
- head = prison_service_data_get(zone_prison_service, pr);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL) {
+ error = ENOENT;
+ goto end;
+ }
LIST_FOREACH(zd, head, zd_next) {
- if (strcmp(dataset, zd->zd_dataset) == 0) {
- LIST_REMOVE(zd, zd_next);
- free(zd, M_ZONES);
- goto success;
- }
+ if (strcmp(dataset, zd->zd_dataset) == 0)
+ break;
}
- error = ENOENT;
-success:
+ if (zd == NULL)
+ error = ENOENT;
+ else {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ if (LIST_EMPTY(head))
+ osd_jail_del(pr, zone_slot);
+ error = 0;
+ }
+end:
mtx_unlock(&pr->pr_mtx);
return (error);
}
@@ -136,14 +161,16 @@ zone_dataset_visible(const char *dataset, int *write)
if (dataset[0] == '\0')
return (0);
- if (INGLOBALZONE(curproc)) {
+ if (INGLOBALZONE(curthread)) {
if (write != NULL)
*write = 1;
return (1);
}
pr = curthread->td_ucred->cr_prison;
mtx_lock(&pr->pr_mtx);
- head = prison_service_data_get(zone_prison_service, pr);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL)
+ goto end;
/*
* Walk the list once, looking for datasets which match exactly, or
@@ -188,49 +215,32 @@ end:
return (ret);
}
-static int
-zone_create(struct prison_service *psrv, struct prison *pr)
-{
- struct zone_dataset_head *head;
-
- head = malloc(sizeof(*head), M_ZONES, M_WAITOK);
- LIST_INIT(head);
- mtx_lock(&pr->pr_mtx);
- prison_service_data_set(psrv, pr, head);
- mtx_unlock(&pr->pr_mtx);
- return (0);
-}
-
-static int
-zone_destroy(struct prison_service *psrv, struct prison *pr)
+static void
+zone_destroy(void *arg)
{
struct zone_dataset_head *head;
zone_dataset_t *zd;
- mtx_lock(&pr->pr_mtx);
- head = prison_service_data_del(psrv, pr);
- mtx_unlock(&pr->pr_mtx);
- while ((zd = LIST_FIRST(head)) != NULL) {
- LIST_REMOVE(zd, zd_next);
- free(zd, M_ZONES);
- }
- free(head, M_ZONES);
- return (0);
+ head = arg;
+ while ((zd = LIST_FIRST(head)) != NULL) {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ }
+ free(head, M_ZONES);
}
static void
zone_sysinit(void *arg __unused)
{
- zone_prison_service = prison_service_register("zfs", zone_create,
- zone_destroy);
+ zone_slot = osd_jail_register(zone_destroy);
}
static void
zone_sysuninit(void *arg __unused)
{
- prison_service_deregister(zone_prison_service);
+ osd_jail_deregister(zone_slot);
}
SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h
index 46752a58995d..1a8bd45964d7 100644
--- a/sys/cddl/compat/opensolaris/sys/atomic.h
+++ b/sys/cddl/compat/opensolaris/sys/atomic.h
@@ -38,6 +38,7 @@
#ifndef __LP64__
extern void atomic_add_64(volatile uint64_t *target, int64_t delta);
+extern void atomic_dec_64(volatile uint64_t *target);
extern void *atomic_cas_ptr(volatile void *target, void *cmp, void *newval);
#endif
#ifndef __sparc64__
@@ -83,6 +84,14 @@ atomic_dec_32_nv(volatile uint32_t *target)
return (atomic_fetchadd_32(target, -1) - 1);
}
+#ifdef __LP64__
+/*
+ * Decrement *target by one using atomic_subtract_64().  Only provided on
+ * __LP64__ builds; other builds use the extern atomic_dec_64() declared
+ * earlier in this header.
+ */
+static __inline void
+atomic_dec_64(volatile uint64_t *target)
+{
+ atomic_subtract_64(target, 1);
+}
+#endif
+
static __inline void
atomic_inc_32(volatile uint32_t *target)
{
diff --git a/sys/cddl/compat/opensolaris/sys/callb.h b/sys/cddl/compat/opensolaris/sys/callb.h
deleted file mode 100644
index 070d0f973bf9..000000000000
--- a/sys/cddl/compat/opensolaris/sys/callb.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD$
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_CALLB_H
-#define _SYS_CALLB_H
-
-#pragma ident "@(#)callb.h 1.29 05/06/23 SMI"
-
-#include <sys/kcondvar.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * definitions of callback classes (c_class)
- *
- * Callbacks belong in the same class if (1) their callback routines
- * do the same kind of processing (ideally, using the same callback function)
- * and (2) they can/should be executed at the same time in a cpr
- * suspend/resume operation.
- *
- * Note: The DAEMON class, in particular, is for stopping kernel threads
- * and nothing else. The CALLB_* macros below should be used to deal
- * with kernel threads, and the callback function should be callb_generic_cpr.
- * Another idiosyncrasy of the DAEMON class is that if a suspend operation
- * fails, some of the callback functions may be called with the RESUME
- * code which were never called with SUSPEND. Not a problem currently,
- * but see bug 4201851.
- */
-#define CB_CL_CPR_DAEMON 0
-#define CB_CL_CPR_VM 1
-#define CB_CL_CPR_CALLOUT 2
-#define CB_CL_CPR_OBP 3
-#define CB_CL_CPR_FB 4
-#define CB_CL_PANIC 5
-#define CB_CL_CPR_RPC 6
-#define CB_CL_CPR_PROMPRINTF 7
-#define CB_CL_UADMIN 8
-#define CB_CL_CPR_PM 9
-#define CB_CL_HALT 10
-#define CB_CL_CPR_DMA 11
-#define CB_CL_CPR_POST_USER 12
-#define CB_CL_UADMIN_PRE_VFS 13
-#define CB_CL_MDBOOT CB_CL_UADMIN
-#define CB_CL_ENTER_DEBUGGER 14
-#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
-
-/*
- * CB_CL_CPR_DAEMON class specific definitions are given below:
- */
-
-/*
- * code for CPR callb_execute_class
- */
-#define CB_CODE_CPR_CHKPT 0
-#define CB_CODE_CPR_RESUME 1
-
-typedef void * callb_id_t;
-/*
- * Per kernel thread structure for CPR daemon callbacks.
- * Must be protected by either a existing lock in the daemon or
- * a new lock created for such a purpose.
- */
-typedef struct callb_cpr {
- kmutex_t *cc_lockp; /* lock to protect this struct */
- char cc_events; /* various events for CPR */
- callb_id_t cc_id; /* callb id address */
- kcondvar_t cc_callb_cv; /* cv for callback waiting */
- kcondvar_t cc_stop_cv; /* cv to checkpoint block */
-} callb_cpr_t;
-
-/*
- * cc_events definitions
- */
-#define CALLB_CPR_START 1 /* a checkpoint request's started */
-#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */
-#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */
-
-/*
- * Used when checking that all kernel threads are stopped.
- */
-#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */
-#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */
-#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */
- /* due to pwr mgmt of disks, make -- */
- /* big enough for worst spinup time */
-
-#ifdef _KERNEL
-/*
- *
- * CALLB_CPR_INIT macro is used by kernel threads to add their entry to
- * the callback table and perform other initialization. It automatically
- * adds the thread as being in the callback class CB_CL_CPR_DAEMON.
- *
- * cp - ptr to the callb_cpr_t structure for this kernel thread
- *
- * lockp - pointer to mutex protecting the callb_cpr_t stuct
- *
- * func - pointer to the callback function for this kernel thread.
- * It has the prototype boolean_t <func>(void *arg, int code)
- * where: arg - ptr to the callb_cpr_t structure
- * code - not used for this type of callback
- * returns: B_TRUE if successful; B_FALSE if unsuccessful.
- *
- * name - a string giving the name of the kernel thread
- *
- * Note: lockp is the lock to protect the callb_cpr_t (cp) structure
- * later on. No lock held is needed for this initialization.
- */
-#define CALLB_CPR_INIT(cp, lockp, func, name) { \
- strlcpy(curthread->td_name, (name), \
- sizeof(curthread->td_name)); \
- strlcpy(curthread->td_proc->p_comm, (name), \
- sizeof(curthread->td_proc->p_comm)); \
- bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \
- (cp)->cc_lockp = lockp; \
- (cp)->cc_id = callb_add(func, (void *)(cp), \
- CB_CL_CPR_DAEMON, name); \
- }
-
-#ifndef __lock_lint
-#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
-#else
-#define CALLB_CPR_ASSERT(cp)
-#endif
-/*
- * Some threads (like the idle threads) do not adhere to the callback
- * protocol and are always considered safe. Such threads must never exit.
- * They register their presence by calling this macro during their
- * initialization.
- *
- * Args:
- * t - thread pointer of the client kernel thread
- * name - a string giving the name of the kernel thread
- */
-#define CALLB_CPR_INIT_SAFE(t, name) { \
- (void) callb_add_thread(callb_generic_cpr_safe, \
- (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \
- name, t); \
- }
-/*
- * The lock to protect cp's content must be held before
- * calling the following two macros.
- *
- * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END
- * is safe for checkpoint/resume.
- */
-#define CALLB_CPR_SAFE_BEGIN(cp) { \
- CALLB_CPR_ASSERT(cp) \
- (cp)->cc_events |= CALLB_CPR_SAFE; \
- if ((cp)->cc_events & CALLB_CPR_START) \
- cv_signal(&(cp)->cc_callb_cv); \
- }
-#define CALLB_CPR_SAFE_END(cp, lockp) { \
- CALLB_CPR_ASSERT(cp) \
- while ((cp)->cc_events & CALLB_CPR_START) \
- cv_wait(&(cp)->cc_stop_cv, lockp); \
- (cp)->cc_events &= ~CALLB_CPR_SAFE; \
- }
-/*
- * cv_destroy is nop right now but may be needed in the future.
- */
-#define CALLB_CPR_EXIT(cp) { \
- CALLB_CPR_ASSERT(cp) \
- (cp)->cc_events |= CALLB_CPR_SAFE; \
- if ((cp)->cc_events & CALLB_CPR_START) \
- cv_signal(&(cp)->cc_callb_cv); \
- mutex_exit((cp)->cc_lockp); \
- (void) callb_delete((cp)->cc_id); \
- cv_destroy(&(cp)->cc_callb_cv); \
- cv_destroy(&(cp)->cc_stop_cv); \
- }
-
-extern callb_cpr_t callb_cprinfo_safe;
-extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *);
-extern callb_id_t callb_add_thread(boolean_t (*)(void *, int),
- void *, int, char *, kthread_id_t);
-extern int callb_delete(callb_id_t);
-extern void callb_execute(callb_id_t, int);
-extern void *callb_execute_class(int, int);
-extern boolean_t callb_generic_cpr(void *, int);
-extern boolean_t callb_generic_cpr_safe(void *, int);
-extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *);
-extern void callb_lock_table(void);
-extern void callb_unlock_table(void);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CALLB_H */
diff --git a/sys/cddl/compat/opensolaris/sys/cred.h b/sys/cddl/compat/opensolaris/sys/cred.h
index 85e79db81eda..b13ef6c62874 100644
--- a/sys/cddl/compat/opensolaris/sys/cred.h
+++ b/sys/cddl/compat/opensolaris/sys/cred.h
@@ -30,12 +30,14 @@
#define _OPENSOLARIS_SYS_CRED_H_
#include <sys/param.h>
-#include_next <sys/ucred.h>
-
-#ifdef _KERNEL
+#define _WANT_UCRED
+#include <sys/ucred.h>
+#undef _WANT_UCRED
typedef struct ucred cred_t;
+typedef struct ucred ucred_t;
+#ifdef _KERNEL
#define CRED() (curthread->td_ucred)
/*
@@ -43,9 +45,14 @@ typedef struct ucred cred_t;
*/
#define kcred (thread0.td_ucred)
-#define crgetuid(cred) ((cred)->cr_uid)
-#define crgetgid(cred) ((cred)->cr_gid)
-
-#endif /* _KERNEL */
+#define crgetuid(cred) ((cred)->cr_uid)
+#define crgetgid(cred) ((cred)->cr_gid)
+#define crgetgroups(cred) ((cred)->cr_groups)
+#define crgetngroups(cred) ((cred)->cr_ngroups)
+#define crgetsid(cred, i) (NULL)
+#else /* !_KERNEL */
+#define kcred NULL
+#define CRED() NULL
+#endif /* !_KERNEL */
#endif /* _OPENSOLARIS_SYS_CRED_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/dnlc.h b/sys/cddl/compat/opensolaris/sys/dnlc.h
index a2d4f01263fc..e978e975bc8a 100644
--- a/sys/cddl/compat/opensolaris/sys/dnlc.h
+++ b/sys/cddl/compat/opensolaris/sys/dnlc.h
@@ -35,6 +35,6 @@
#define dnlc_update(dvp, name, vp) do { } while (0)
#define dnlc_remove(dvp, name) do { } while (0)
#define dnlc_purge_vfsp(vfsp, count) (0)
-#define dnlc_reduce_cache(percent) do { } while (0)
+#define dnlc_reduce_cache(percent) EVENTHANDLER_INVOKE(vfs_lowvnodes, (int)(intptr_t)(percent))
#endif /* !_OPENSOLARIS_SYS_DNLC_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/file.h b/sys/cddl/compat/opensolaris/sys/file.h
new file mode 100644
index 000000000000..afd10501d016
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/file.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_FILE_H_
+#define _OPENSOLARIS_SYS_FILE_H_
+
+#include_next <sys/file.h>
+
+#ifdef _KERNEL
+typedef struct file file_t;
+
+static __inline file_t * /* fp filled in by fget_*() on success, NULL otherwise */
+getf(int fd, int write)
+{
+ struct file *fp;
+
+ if (write && fget_write(curthread, fd, &fp) == 0) /* write != 0: look up via fget_write() */
+ return (fp);
+ else if (!write && fget_read(curthread, fd, &fp) == 0) /* write == 0: look up via fget_read() */
+ return (fp);
+ return (NULL); /* the selected lookup returned non-zero */
+}
+
+static __inline void
+releasef(file_t *fp)
+{
+
+ fdrop(fp, curthread); /* NOTE(review): presumably drops the reference obtained by getf() -- confirm */
+}
+#endif /* _KERNEL */
+
+#endif /* !_OPENSOLARIS_SYS_FILE_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/kidmap.h b/sys/cddl/compat/opensolaris/sys/kidmap.h
new file mode 100644
index 000000000000..c2a33d2d3ebf
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/kidmap.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_KIDMAP_H_
+#define _OPENSOLARIS_SYS_KIDMAP_H_
+
+#include <sys/idmap.h>
+
+typedef int32_t idmap_stat;
+typedef void idmap_get_handle_t;
+
+#define kidmap_get_create() (NULL)
+#define kidmap_get_destroy(hdl) do { } while (0)
+#define kidmap_get_mappings(hdl) (NULL)
+
+#endif /* _OPENSOLARIS_SYS_KIDMAP_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/kmem.h b/sys/cddl/compat/opensolaris/sys/kmem.h
index 5258cffe4edf..c103d18b4e3b 100644
--- a/sys/cddl/compat/opensolaris/sys/kmem.h
+++ b/sys/cddl/compat/opensolaris/sys/kmem.h
@@ -38,15 +38,16 @@
#include <vm/vm_extern.h>
#define KM_SLEEP M_WAITOK
+#define KM_PUSHPAGE M_WAITOK
#define KM_NOSLEEP M_NOWAIT
#define KMC_NODEBUG 0
typedef struct kmem_cache {
char kc_name[32];
-#ifdef _KERNEL
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
uma_zone_t kc_zone;
#else
- size_t size;
+ size_t kc_size;
#endif
int (*kc_constructor)(void *, void *, int);
void (*kc_destructor)(void *, void *);
diff --git a/sys/cddl/compat/opensolaris/sys/misc.h b/sys/cddl/compat/opensolaris/sys/misc.h
index a5a52b786eec..8e1a637a3b68 100644
--- a/sys/cddl/compat/opensolaris/sys/misc.h
+++ b/sys/cddl/compat/opensolaris/sys/misc.h
@@ -29,6 +29,13 @@
#ifndef _OPENSOLARIS_SYS_MISC_H_
#define _OPENSOLARIS_SYS_MISC_H_
+#define MAXUID 2147483647
+
+#define SPEC_MAXOFFSET_T OFF_MAX
+
+#define _ACL_ACLENT_ENABLED 0x1
+#define _ACL_ACE_ENABLED 0x2
+
#define _FIOFFS (INT_MIN)
#define _FIOGDIO (INT_MIN+1)
#define _FIOSDIO (INT_MIN+2)
diff --git a/sys/cddl/compat/opensolaris/sys/mntent.h b/sys/cddl/compat/opensolaris/sys/mntent.h
index e4bbc9da225b..3faea6b73430 100644
--- a/sys/cddl/compat/opensolaris/sys/mntent.h
+++ b/sys/cddl/compat/opensolaris/sys/mntent.h
@@ -54,5 +54,7 @@
#define MNTOPT_EXEC "exec" /* enable executables */
#define MNTOPT_NOEXEC "noexec" /* disable executables */
#define MNTOPT_RESTRICT "restrict" /* restricted autofs mount */
+#define MNTOPT_NBMAND "nbmand" /* allow non-blocking mandatory locks */
+#define MNTOPT_NONBMAND "nonbmand" /* deny non-blocking mandatory locks */
#endif /* !_OPENSOLARIS_MNTENT_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/param.h b/sys/cddl/compat/opensolaris/sys/param.h
index 8d36a9d07700..609d22afe8c1 100644
--- a/sys/cddl/compat/opensolaris/sys/param.h
+++ b/sys/cddl/compat/opensolaris/sys/param.h
@@ -34,4 +34,8 @@
#define PAGESIZE PAGE_SIZE
+#ifdef _KERNEL
+#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT)
+#endif
+
#endif
diff --git a/sys/cddl/compat/opensolaris/sys/pathname.h b/sys/cddl/compat/opensolaris/sys/pathname.h
new file mode 100644
index 000000000000..0d396231c65d
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/pathname.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_PATHNAME_H_
+#define _OPENSOLARIS_SYS_PATHNAME_H_
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+typedef struct pathname {
+ char *pn_buf; /* underlying storage */
+ char *pn_path; /* remaining pathname */
+ size_t pn_pathlen; /* remaining length */
+ size_t pn_bufsize; /* total size of pn_buf */
+} pathname_t;
+
+#define pn_alloc(pnp) panic("pn_alloc() called")
+#define pn_free(pnp) panic("pn_free() called")
+
+int lookupname(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **);
+int lookupnameat(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **,
+ vnode_t *);
+int traverse(vnode_t **, int);
+
+#endif /* _KERNEL */
+
+#endif /* _OPENSOLARIS_SYS_PATHNAME_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/policy.h b/sys/cddl/compat/opensolaris/sys/policy.h
index 2c764ef29c10..08db5ca763d7 100644
--- a/sys/cddl/compat/opensolaris/sys/policy.h
+++ b/sys/cddl/compat/opensolaris/sys/policy.h
@@ -33,30 +33,44 @@
#ifdef _KERNEL
+#include <sys/vnode.h>
+
struct mount;
struct ucred;
struct vattr;
struct vnode;
-int secpolicy_zfs(struct ucred *cred);
-int secpolicy_sys_config(struct ucred *cred, int checkonly);
-int secpolicy_zinject(struct ucred *cred);
-int secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp);
-int secpolicy_basic_link(struct ucred *cred);
+int secpolicy_nfs(struct ucred *cred);
+int secpolicy_zfs(struct ucred *cred);
+int secpolicy_sys_config(struct ucred *cred, int checkonly);
+int secpolicy_zinject(struct ucred *cred);
+int secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp);
+int secpolicy_basic_link(struct vnode *vp, struct ucred *cred);
+int secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner);
+int secpolicy_vnode_chown(struct vnode *vp, cred_t *cred,
+ boolean_t check_self);
int secpolicy_vnode_stky_modify(struct ucred *cred);
-int secpolicy_vnode_remove(struct ucred *cred);
+int secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred);
int secpolicy_vnode_access(struct ucred *cred, struct vnode *vp,
uint64_t owner, accmode_t accmode);
-int secpolicy_vnode_setdac(struct ucred *cred, uid_t owner);
+int secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred,
+ uid_t owner);
int secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp,
struct vattr *vap, const struct vattr *ovap, int flags,
int unlocked_access(void *, int, struct ucred *), void *node);
int secpolicy_vnode_create_gid(struct ucred *cred);
-int secpolicy_vnode_setids_setgids(struct ucred *cred, gid_t gid);
-int secpolicy_vnode_setid_retain(struct ucred *cred, boolean_t issuidroot);
-void secpolicy_setid_clear(struct vattr *vap, struct ucred *cred);
+int secpolicy_vnode_setids_setgids(struct vnode *vp, struct ucred *cred,
+ gid_t gid);
+int secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred,
+ boolean_t issuidroot);
+void secpolicy_setid_clear(struct vattr *vap, struct vnode *vp,
+ struct ucred *cred);
int secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
const struct vattr *ovap, struct ucred *cred);
+int secpolicy_fs_owner(struct mount *vfsp, struct ucred *cred);
+int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp);
+void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp);
+int secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype);
#endif /* _KERNEL */
diff --git a/sys/cddl/compat/opensolaris/sys/proc.h b/sys/cddl/compat/opensolaris/sys/proc.h
index 2410396d4a9a..73fbcdadf1d2 100644
--- a/sys/cddl/compat/opensolaris/sys/proc.h
+++ b/sys/cddl/compat/opensolaris/sys/proc.h
@@ -54,12 +54,6 @@ typedef struct thread kthread_t;
typedef struct thread *kthread_id_t;
typedef struct proc proc_t;
-#if (KSTACK_PAGES * PAGE_SIZE) < 16384
-#define ZFS_KSTACK_PAGES (16384 / PAGE_SIZE)
-#else
-#define ZFS_KSTACK_PAGES 0
-#endif
-
static __inline kthread_t *
thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
size_t len, proc_t *pp, int state, pri_t pri)
@@ -71,11 +65,10 @@ thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
* Be sure there are no surprises.
*/
ASSERT(stk == NULL);
- ASSERT(stksize == 0);
ASSERT(len == 0);
ASSERT(state == TS_RUN);
- error = kproc_create(proc, arg, &p, 0, ZFS_KSTACK_PAGES,
+ error = kproc_create(proc, arg, &p, 0, stksize / PAGE_SIZE,
"solthread %p", proc);
return (error == 0 ? FIRST_THREAD_IN_PROC(p) : NULL);
}
diff --git a/sys/cddl/compat/opensolaris/sys/refstr.h b/sys/cddl/compat/opensolaris/sys/refstr.h
new file mode 100644
index 000000000000..e4e177bf435c
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/refstr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_REFSTR_H_
+#define _OPENSOLARIS_SYS_REFSTR_H_
+
+#define refstr_value(str) (str)
+
+#endif /* _OPENSOLARIS_SYS_REFSTR_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/sid.h b/sys/cddl/compat/opensolaris/sys/sid.h
new file mode 100644
index 000000000000..eb8d0bed3eeb
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/sid.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_SID_H_
+#define _OPENSOLARIS_SYS_SID_H_
+
+typedef struct ksiddomain {
+ char kd_name[16]; /* Domain part of SID */
+} ksiddomain_t;
+typedef void ksid_t;
+
+static __inline ksiddomain_t *
+ksid_lookupdomain(const char *domain) /* NOTE: 'domain' is never read by this stub */
+{
+ ksiddomain_t *kd;
+
+ kd = kmem_alloc(sizeof(*kd), KM_SLEEP); /* the matching kmem_free() is in ksiddomain_rele() */
+ strlcpy(kd->kd_name, "FreeBSD", sizeof(kd->kd_name)); /* same fixed name for every request */
+ return (kd);
+}
+
+static __inline void
+ksiddomain_rele(ksiddomain_t *kd)
+{
+
+ kmem_free(kd, sizeof(*kd)); /* size matches the kmem_alloc() in ksid_lookupdomain() */
+}
+
+#endif /* _OPENSOLARIS_SYS_SID_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/sig.h b/sys/cddl/compat/opensolaris/sys/sig.h
new file mode 100644
index 000000000000..985896ee2679
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/sig.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_SIG_H_
+#define _OPENSOLARIS_SYS_SIG_H_
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/debug.h>
+
+#define FORREAL 0
+#define JUSTLOOKING 1
+
+static __inline int /* 1 when a signal is reported for curthread, 0 otherwise */
+issig(int why)
+{
+ struct thread *td = curthread;
+ struct proc *p;
+ int sig;
+
+ ASSERT(why == FORREAL || why == JUSTLOOKING);
+ if (SIGPENDING(td)) {
+ if (why == JUSTLOOKING) /* answer from SIGPENDING() alone, without calling cursig() */
+ return (1);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ mtx_lock(&p->p_sigacts->ps_mtx); /* cursig() runs with the proc lock and ps_mtx both held */
+ sig = cursig(td);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p);
+ if (sig != 0) /* FORREAL: only a non-zero cursig() result counts */
+ return (1);
+ }
+ return (0);
+}
+
+#endif /* _KERNEL */
+
+#endif /* _OPENSOLARIS_SYS_SIG_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/sunddi.h b/sys/cddl/compat/opensolaris/sys/sunddi.h
index 192d5a992c34..1ca2bf09fd6d 100644
--- a/sys/cddl/compat/opensolaris/sys/sunddi.h
+++ b/sys/cddl/compat/opensolaris/sys/sunddi.h
@@ -29,8 +29,10 @@
#ifndef _OPENSOLARIS_SYS_SUNDDI_H_
#define _OPENSOLARIS_SYS_SUNDDI_H_
+#define ddi_driver_major(zfs_dip) (0)
#define ddi_copyin(from, to, size, flag) (bcopy((from), (to), (size)), 0)
#define ddi_copyout(from, to, size, flag) (bcopy((from), (to), (size)), 0)
+int ddi_strtol(const char *str, char **nptr, int base, long *result);
int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result);
#endif /* _OPENSOLARIS_SYS_SUNDDI_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/sysmacros.h b/sys/cddl/compat/opensolaris/sys/sysmacros.h
index a179c5f869da..7f4885bac600 100644
--- a/sys/cddl/compat/opensolaris/sys/sysmacros.h
+++ b/sys/cddl/compat/opensolaris/sys/sysmacros.h
@@ -39,6 +39,10 @@
extern "C" {
#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+
/*
* Macro for checking power of 2 address alignment.
*/
diff --git a/sys/cddl/compat/opensolaris/sys/time.h b/sys/cddl/compat/opensolaris/sys/time.h
index 770b25163191..0bf1e9bf6b82 100644
--- a/sys/cddl/compat/opensolaris/sys/time.h
+++ b/sys/cddl/compat/opensolaris/sys/time.h
@@ -40,6 +40,9 @@ typedef longlong_t hrtime_t;
#define LBOLT ((gethrtime() * hz) / NANOSEC)
+#define TIMESPEC_OVERFLOW(ts) \
+ ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX)
+
#ifdef _KERNEL
#define lbolt64 (int64_t)(LBOLT)
diff --git a/sys/cddl/compat/opensolaris/sys/types.h b/sys/cddl/compat/opensolaris/sys/types.h
index 7d5d9e41ed14..069ad45b2f32 100644
--- a/sys/cddl/compat/opensolaris/sys/types.h
+++ b/sys/cddl/compat/opensolaris/sys/types.h
@@ -44,13 +44,15 @@ typedef u_char uchar_t;
typedef u_short ushort_t;
typedef u_long ulong_t;
typedef long long longlong_t;
-typedef unsigned long long u_longlong_t;
+typedef unsigned long long u_longlong_t;
typedef off_t off64_t;
typedef id_t taskid_t;
typedef id_t projid_t;
typedef id_t poolid_t;
typedef id_t zoneid_t;
typedef id_t ctid_t;
+typedef mode_t o_mode_t;
+typedef uint64_t pgcnt_t;
#ifdef _KERNEL
@@ -60,8 +62,8 @@ typedef id_t ctid_t;
typedef short index_t;
typedef off_t offset_t;
typedef long ptrdiff_t; /* pointer difference */
-typedef void pathname_t;
typedef int64_t rlim64_t;
+typedef int major_t;
#else
#ifdef NEED_SOLARIS_BOOLEAN
@@ -80,7 +82,6 @@ typedef short pri_t;
typedef int32_t daddr32_t;
typedef int32_t time32_t;
typedef u_longlong_t diskaddr_t;
-typedef ushort_t o_mode_t; /* old file attribute type */
#endif /* !_KERNEL */
diff --git a/sys/cddl/compat/opensolaris/sys/uio.h b/sys/cddl/compat/opensolaris/sys/uio.h
index d219ff0e960c..9e53457baf2b 100644
--- a/sys/cddl/compat/opensolaris/sys/uio.h
+++ b/sys/cddl/compat/opensolaris/sys/uio.h
@@ -60,6 +60,6 @@ zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio)
return (uiomove(cp, (int)n, uio));
}
#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio))
-#endif
+#endif /* BUILDING_ZFS */
#endif /* !_OPENSOLARIS_SYS_UIO_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/vfs.h b/sys/cddl/compat/opensolaris/sys/vfs.h
index c2d8a6b71119..be3e3cfe5fcf 100644
--- a/sys/cddl/compat/opensolaris/sys/vfs.h
+++ b/sys/cddl/compat/opensolaris/sys/vfs.h
@@ -45,6 +45,7 @@ typedef struct mount vfs_t;
#define vfs_count mnt_ref
#define vfs_fsid mnt_stat.f_fsid
#define vfs_bsize mnt_stat.f_bsize
+#define vfs_resource mnt_stat.f_mntfromname
#define v_flag v_vflag
#define v_vfsp v_mount
@@ -64,6 +65,8 @@ typedef struct mount vfs_t;
MNT_IUNLOCK(vfsp); \
} while (0)
+#define fs_vscan(vp, cr, async) (0)
+
#define VROOT VV_ROOT
/*
@@ -107,10 +110,21 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
int flags __unused);
void vfs_clearmntopt(vfs_t *vfsp, const char *name);
int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
-int traverse(vnode_t **cvpp, int lktype);
int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
char *fspec, int fsflags);
+typedef uint64_t vfs_feature_t;
+
+#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */
+#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */
+#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */
+#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */
+#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */
+#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */
+
+#define vfs_set_feature(vfsp, feature) do { } while (0)
+#define vfs_has_feature(vfsp, feature) (0)
+
#endif /* _KERNEL */
#endif /* _OPENSOLARIS_SYS_VFS_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h
index a8a261cc2449..b490d337d5e7 100644
--- a/sys/cddl/compat/opensolaris/sys/vnode.h
+++ b/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -29,24 +29,32 @@
#ifndef _OPENSOLARIS_SYS_VNODE_H_
#define _OPENSOLARIS_SYS_VNODE_H_
+struct vnode;
+struct vattr;
+
+typedef struct vnode vnode_t;
+typedef struct vattr vattr_t;
+typedef enum vtype vtype_t;
+
+#include <sys/namei.h>
+enum symfollow { NO_FOLLOW = NOFOLLOW };
+
+#include <sys/proc.h>
#include_next <sys/vnode.h>
#include <sys/mount.h>
#include <sys/cred.h>
#include <sys/fcntl.h>
-#include <sys/namei.h>
-#include <sys/proc.h>
+#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/syscallsubr.h>
-typedef struct vnode vnode_t;
-typedef struct vattr vattr_t;
-typedef void caller_context_t;
-
typedef struct vop_vector vnodeops_t;
#define vop_fid vop_vptofh
#define vop_fid_args vop_vptofh_args
#define a_fid a_fhp
+#define IS_XATTRDIR(dvp) (0)
+
#define v_count v_usecount
static __inline int
@@ -59,23 +67,24 @@ vn_is_readonly(vnode_t *vp)
#define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
#define vn_mountedvfs(vp) ((vp)->v_mountedhere)
#define vn_has_cached_data(vp) ((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0)
+#define vn_exists(vp) do { } while (0)
+#define vn_invalid(vp) do { } while (0)
+#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)
+#define vn_free(vp) do { } while (0)
#define VN_HOLD(v) vref(v)
#define VN_RELE(v) vrele(v)
#define VN_URELE(v) vput(v)
-#define VOP_REALVP(vp, vpp) (*(vpp) = (vp), 0)
-
-#define vnevent_remove(vp) do { } while (0)
-#define vnevent_rmdir(vp) do { } while (0)
-#define vnevent_rename_src(vp) do { } while (0)
-#define vnevent_rename_dest(vp) do { } while (0)
-
+#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0)
-#define IS_DEVVP(vp) \
- ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
-
-#define MODEMASK ALLPERMS
+#define vnevent_create(vp, ct) do { } while (0)
+#define vnevent_link(vp, ct) do { } while (0)
+#define vnevent_remove(vp, dvp, name, ct) do { } while (0)
+#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0)
+#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0)
+#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0)
+#define vnevent_rename_dest_dir(vp, ct) do { } while (0)
#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp))
#define MANDMODE(mode) (0)
@@ -98,24 +107,6 @@ vn_is_readonly(vnode_t *vp)
#define MAXOFFSET_T OFF_MAX
#define EXCL 0
-#define AT_TYPE 0x0001
-#define AT_MODE 0x0002
-#define AT_UID 0x0004
-#define AT_GID 0x0008
-#define AT_FSID 0x0010
-#define AT_NODEID 0x0020
-#define AT_NLINK 0x0040
-#define AT_SIZE 0x0080
-#define AT_ATIME 0x0100
-#define AT_MTIME 0x0200
-#define AT_CTIME 0x0400
-#define AT_RDEV 0x0800
-#define AT_BLKSIZE 0x1000
-#define AT_NBLOCKS 0x2000
-#define AT_SEQ 0x4000
-#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
- AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
#define ACCESSED (AT_ATIME)
#define STATE_CHANGED (AT_CTIME)
#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
@@ -140,28 +131,37 @@ vattr_init_mask(vattr_t *vap)
vap->va_mask |= AT_MTIME;
if (vap->va_mode != (u_short)VNOVAL)
vap->va_mask |= AT_MODE;
+ if (vap->va_flags != VNOVAL)
+ vap->va_mask |= AT_XVATTR;
}
-#define FCREAT O_CREAT
-#define FTRUNC O_TRUNC
-#define FDSYNC FFSYNC
-#define FRSYNC FFSYNC
-#define FSYNC FFSYNC
-#define FOFFMAX 0x00
-
-enum create { CRCREAT };
+#define FCREAT O_CREAT
+#define FTRUNC O_TRUNC
+#define FDSYNC FFSYNC
+#define FRSYNC FFSYNC
+#define FSYNC FFSYNC
+#define FOFFMAX 0x00
+#define FIGNORECASE 0x00
static __inline int
-zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
- vnode_t **vpp, enum create crwhy, mode_t umask)
+vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode,
+ vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp,
+ int fd)
{
struct thread *td = curthread;
struct nameidata nd;
- int error;
+ int error, operation;
ASSERT(seg == UIO_SYSSPACE);
- ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
- ASSERT(crwhy == CRCREAT);
+ if ((filemode & FCREAT) != 0) {
+ ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
+ ASSERT(crwhy == CRCREAT);
+ operation = CREATE;
+ } else {
+ ASSERT(filemode == (FREAD | FWRITE | FOFFMAX));
+ ASSERT(crwhy == 0);
+ operation = LOOKUP;
+ }
ASSERT(umask == 0);
if (td->td_proc->p_fd->fd_rdir == NULL)
@@ -169,7 +169,10 @@ zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
if (td->td_proc->p_fd->fd_cdir == NULL)
td->td_proc->p_fd->fd_cdir = rootvnode;
- NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td);
+ if (startvp != NULL)
+ vref(startvp);
+ NDINIT_ATVP(&nd, operation, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pnamep,
+ startvp, td);
error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error == 0) {
@@ -180,6 +183,15 @@ zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
}
return (error);
}
+
+/*
+ * Open pnamep without a starting vnode: every argument is forwarded
+ * unchanged to vn_openat(), with startvp == NULL and fd == -1.
+ */
+static __inline int
+zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
+ vnode_t **vpp, enum create crwhy, mode_t umask)
+{
+
+ return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
+ umask, NULL, -1));
+}
#define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \
zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask))
@@ -192,14 +204,16 @@ zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len,
struct thread *td = curthread;
int error, vfslocked, resid;
- ASSERT(rw == UIO_WRITE);
ASSERT(ioflag == 0);
ASSERT(ulimit == RLIM64_INFINITY);
- ioflag = IO_APPEND | IO_UNIT;
-
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ if (rw == UIO_WRITE) {
+ ioflag = IO_SYNC;
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ } else {
+ ioflag = IO_DIRECT;
+ }
error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED,
&resid, td);
VFS_UNLOCK_GIANT(vfslocked);
@@ -229,7 +243,7 @@ drop:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
-#define VOP_FSYNC(vp, flag, cr) zfs_vop_fsync((vp), (flag), (cr))
+#define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr))
static __inline int
zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
@@ -241,7 +255,7 @@ zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
return (vn_close(vp, flag, cr, curthread));
}
-#define VOP_CLOSE(vp, oflags, count, offset, cr) \
+#define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \
zfs_vop_close((vp), (oflags), (count), (offset), (cr))
static __inline int
@@ -253,7 +267,6 @@ vn_rename(char *from, char *to, enum uio_seg seg)
return (kern_rename(curthread, from, to, seg));
}
-enum rm { RMFILE };
static __inline int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
diff --git a/sys/cddl/compat/opensolaris/sys/zone.h b/sys/cddl/compat/opensolaris/sys/zone.h
index 2e47eb17b44e..d761310a1a81 100644
--- a/sys/cddl/compat/opensolaris/sys/zone.h
+++ b/sys/cddl/compat/opensolaris/sys/zone.h
@@ -38,9 +38,9 @@
*/
/*
- * Is process in the global zone?
+ * Is thread in the global zone?
*/
-#define INGLOBALZONE(p) (!jailed((p)->p_ucred))
+#define INGLOBALZONE(thread) (!jailed((thread)->td_ucred))
/*
* Attach the given dataset to the given jail.
@@ -61,8 +61,6 @@ extern int zone_dataset_visible(const char *, int *);
#define GLOBAL_ZONEID 0
-extern int getzoneid(void);
-
#endif /* _KERNEL */
#endif /* !_OPENSOLARIS_SYS_ZONE_H_ */
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
index 2f32e7a8a78a..e6b678079635 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,100 +19,245 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
-#include <sys/acl.h>
#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
#if defined(_KERNEL)
+#include <sys/kmem.h>
#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
#include <sys/debug.h>
#else
#include <errno.h>
#include <stdlib.h>
+#include <stddef.h>
#include <strings.h>
+#include <unistd.h>
#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
#define ASSERT assert
#endif
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+/*
+ * Values stored in ace_list_t.state.  ace_list_init() starts every list
+ * in ace_unused.
+ * NOTE(review): presumably records which class of entry the ace_t ->
+ * aclent_t conversion handled last -- confirm against the code that
+ * advances it.
+ */
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+/*
+ * Per-identity accumulator.  acevals_find() allocates one for every
+ * a_who it has not met yet and inserts it into an AVL tree; ace_list_t
+ * also embeds three of them directly.  acevals_init() zeroes the record
+ * and marks the three masks as ACE_MASK_UNDEFINED.
+ */
+typedef struct acevals {
+ uid_t key; /* lookup id; acevals_find() takes it from ace_t a_who */
+ avl_node_t avl; /* NOTE(review): presumably the AVL tree linkage */
+ uint32_t mask; /* ACE_MASK_UNDEFINED until assigned */
+ uint32_t allowed; /* ACE_MASK_UNDEFINED until assigned */
+ uint32_t denied; /* ACE_MASK_UNDEFINED until assigned */
+ int aent_type; /* zero after acevals_init() */
+} acevals_t;
+
+/*
+ * Working state for one ACL conversion pass, reset by ace_list_init().
+ * NOTE(review): ace_list_init() does not touch the two avl_tree_t
+ * members; presumably the caller creates them separately -- confirm.
+ */
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers; /* presumably the entry count of "user" -- confirm */
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups; /* presumably the entry count of "group" -- confirm */
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag; /* stored verbatim from ace_list_init()'s argument */
+ ace_to_aent_state_t state; /* starts as ace_unused */
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
ace_t trivial_acl[] = {
- {-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
+ {(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
+ {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
- {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE},
- {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
+ {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
+ ACE_ACCESS_DENIED_ACE_TYPE},
+ {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
+ ACE_ACCESS_ALLOWED_ACE_TYPE},
+ {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ {(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
};
void
-adjust_ace_pair(ace_t *pair, mode_t mode)
+adjust_ace_pair_common(void *pair, size_t access_off,
+ size_t pairsize, mode_t mode)
{
+ char *datap = (char *)pair;
+ uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
+ uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
+ access_off);
if (mode & S_IROTH)
- pair[1].a_access_mask |= ACE_READ_DATA;
+ *amask1 |= ACE_READ_DATA;
else
- pair[0].a_access_mask |= ACE_READ_DATA;
+ *amask0 |= ACE_READ_DATA;
if (mode & S_IWOTH)
- pair[1].a_access_mask |=
- ACE_WRITE_DATA|ACE_APPEND_DATA;
+ *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
else
- pair[0].a_access_mask |=
- ACE_WRITE_DATA|ACE_APPEND_DATA;
+ *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
if (mode & S_IXOTH)
- pair[1].a_access_mask |= ACE_EXECUTE;
+ *amask1 |= ACE_EXECUTE;
else
- pair[0].a_access_mask |= ACE_EXECUTE;
+ *amask0 |= ACE_EXECUTE;
+}
+
+/*
+ * ace_t flavour of adjust_ace_pair_common(): supplies the offset of
+ * a_access_mask within an ace_t and sizeof (ace_t) as the distance from
+ * the first entry of the pair to the second.
+ */
+void
+adjust_ace_pair(ace_t *pair, mode_t mode)
+{
+ adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask),
+ sizeof (ace_t), mode);
+}
+
+/*
+ * Note that an ACE of the given type has been seen: an ALLOW type sets
+ * *allow to B_TRUE, a DENY type sets *deny to B_TRUE.  Any other type
+ * leaves both flags alone, and a flag is never cleared here.
+ */
+static void
+ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny)
+{
+	switch (type) {
+	case ACE_ACCESS_ALLOWED_ACE_TYPE:
+		*allow = B_TRUE;
+		break;
+	case ACE_ACCESS_DENIED_ACE_TYPE:
+		*deny = B_TRUE;
+		break;
+	default:
+		break;
+	}
}
/*
* ace_trivial:
* determine whether an ace_t acl is trivial
*
- * Trivialness implys that the acl is composed of only
+ * Trivialness implies that the acl is composed of only
* owner, group, everyone entries. ACL can't
* have read_acl denied, and write_owner/write_acl/write_attributes
* can only be owner@ entry.
*/
int
-ace_trivial(ace_t *acep, int aclcnt)
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
{
- int i;
- int owner_seen = 0;
- int group_seen = 0;
- int everyone_seen = 0;
+ boolean_t owner_allow = B_FALSE;
+ boolean_t group_allow = B_FALSE;
+ boolean_t everyone_allow = B_FALSE;
+ boolean_t owner_deny = B_FALSE;
+ boolean_t group_deny = B_FALSE;
+ boolean_t everyone_deny = B_FALSE;
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
- for (i = 0; i != aclcnt; i++) {
- switch (acep[i].a_flags & 0xf040) {
+ while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
+ switch (flags & ACE_TYPE_FLAGS) {
case ACE_OWNER:
- if (group_seen || everyone_seen)
+ if (group_allow || group_deny || everyone_allow ||
+ everyone_deny)
return (1);
- owner_seen++;
+ ace_allow_deny_helper(type, &owner_allow, &owner_deny);
break;
case ACE_GROUP|ACE_IDENTIFIER_GROUP:
- if (everyone_seen || owner_seen == 0)
+ if (everyone_allow || everyone_deny &&
+ (!owner_allow && !owner_deny))
return (1);
- group_seen++;
+ ace_allow_deny_helper(type, &group_allow, &group_deny);
break;
case ACE_EVERYONE:
- if (owner_seen == 0 || group_seen == 0)
+ if (!owner_allow && !owner_deny &&
+ !group_allow && !group_deny)
return (1);
- everyone_seen++;
+ ace_allow_deny_helper(type,
+ &everyone_allow, &everyone_deny);
break;
default:
return (1);
}
- if (acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+ if (flags & (ACE_FILE_INHERIT_ACE|
ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
ACE_INHERIT_ONLY_ACE))
return (1);
@@ -124,27 +268,49 @@ ace_trivial(ace_t *acep, int aclcnt)
* Don't allow anybody to deny reading basic
* attributes or a files ACL.
*/
- if ((acep[i].a_access_mask &
- (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- (acep[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE))
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
return (1);
/*
* Allow on owner@ to allow
* write_acl/write_owner/write_attributes
*/
- if (acep[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
- (!(acep[i].a_flags & ACE_OWNER) && (acep[i].a_access_mask &
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
(ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES))))
return (1);
+
}
- if ((owner_seen == 0) || (group_seen == 0) || (everyone_seen == 0))
- return (1);
+ if (!owner_allow || !owner_deny || !group_allow || !group_deny ||
+ !everyone_allow || !everyone_deny)
+ return (1);
return (0);
}
+/*
+ * ace_walk:
+ *	Iterator over a plain ace_t array, in the shape ace_trivial_common()
+ *	expects for its walk callback.
+ *
+ *	datap	array of aclcnt ace_t entries
+ *	cookie	index of the entry to fetch; pass 0 to start
+ *	flags, type, mask
+ *		receive a_flags, a_type and a_access_mask of that entry
+ *
+ *	Returns the cookie for the next call (index + 1), or 0 once the
+ *	array is exhausted; the out parameters are untouched in that case.
+ */
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+    uint16_t *type, uint32_t *mask)
+{
+	ace_t *acep = datap;
+
+	/*
+	 * aclcnt is signed while cookie is not: a bare comparison would
+	 * convert a negative count to a huge unsigned value and let the
+	 * walk run off the array, so reject non-positive counts first.
+	 */
+	if (aclcnt <= 0 || cookie >= (uint64_t)aclcnt)
+		return (0);
+
+	*flags = acep[cookie].a_flags;
+	*type = acep[cookie].a_type;
+	*mask = acep[cookie].a_access_mask;
+
+	return (cookie + 1);
+}
+
+/*
+ * ace_t entry point: runs ace_trivial_common() over the array with
+ * ace_walk() as the iterator and returns its result.
+ */
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
/*
* Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
@@ -171,8 +337,8 @@ ksort(caddr_t v, int n, int s, int (*f)())
for (g = n / 2; g > 0; g /= 2) {
for (i = g; i < n; i++) {
for (j = i - g; j >= 0 &&
- (*f)(v + j * s, v + (j + g) * s) == 1;
- j -= g) {
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
p1 = (void *)(v + j * s);
p2 = (void *)(v + (j + g) * s);
for (ii = 0; ii < s / 4; ii++) {
@@ -215,3 +381,1347 @@ cmp2acls(void *a, void *b)
/* Totally equal */
return (0);
}
+
+/*
+ * Resize a buffer to new_size bytes and return the resulting pointer.
+ *
+ * Kernel: a fresh buffer is allocated with KM_SLEEP, the smaller of the
+ * two sizes is copied across and the old buffer is released, so size
+ * must be the old allocation size.  Userland: plain realloc(); size is
+ * not used there, hence the lint directive.
+ */
+/*ARGSUSED*/
+static void *
+cacl_realloc(void *ptr, size_t size, size_t new_size)
+{
+#if defined(_KERNEL)
+	size_t ncopy = new_size;
+	void *fresh = kmem_alloc(new_size, KM_SLEEP);
+
+	if (size < new_size)
+		ncopy = size;
+	(void) memcpy(fresh, ptr, ncopy);
+	kmem_free(ptr, size);
+	return (fresh);
+#else
+	return (realloc(ptr, new_size));
+#endif
+}
+
+/*
+ * Allocate size zero-filled bytes and hand them back through *ptr.
+ * Returns 0 on success.  The kernel path allocates with KM_SLEEP and
+ * always reports success; in userland a failed calloc() leaves *ptr
+ * NULL and the current errno is returned.
+ */
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+#if defined(_KERNEL)
+	*ptr = kmem_zalloc(size, KM_SLEEP);
+	return (0);
+#else
+	void *buf = calloc(1, size);
+
+	*ptr = buf;
+	return ((buf != NULL) ? 0 : errno);
+#endif
+}
+
+/*
+ * Release a buffer: kmem_free(ptr, size) in the kernel, free(ptr) in
+ * userland, where size is ignored (hence the lint directive below).
+ */
+/*ARGSUSED*/
+static void
+cacl_free(void *ptr, size_t size)
+{
+#if defined(_KERNEL)
+ kmem_free(ptr, size);
+#else
+ free(ptr);
+#endif
+}
+
+/*
+ * Allocate an empty acl_t of the requested flavour.
+ *
+ * The returned ACL has no entries (acl_aclp NULL, acl_cnt 0) and its
+ * acl_entry_size preset to sizeof (ace_t) for ACE_T or sizeof (aclent_t)
+ * for ACLENT_T.  Returns NULL if the allocation fails or the type is
+ * neither of those.
+ */
+acl_t *
+acl_alloc(enum acl_type type)
+{
+	acl_t *newacl;
+
+	if (cacl_malloc((void **)&newacl, sizeof (acl_t)) != 0)
+		return (NULL);
+
+	newacl->acl_aclp = NULL;
+	newacl->acl_cnt = 0;
+
+	if (type == ACE_T) {
+		newacl->acl_type = ACE_T;
+		newacl->acl_entry_size = sizeof (ace_t);
+	} else if (type == ACLENT_T) {
+		newacl->acl_type = ACLENT_T;
+		newacl->acl_entry_size = sizeof (aclent_t);
+	} else {
+		/* Unknown flavour: release the shell and report failure. */
+		acl_free(newacl);
+		newacl = NULL;
+	}
+	return (newacl);
+}
+
+/*
+ * Free acl_t structure
+ *
+ * Releases the entry array, if one is attached, using acl_cnt *
+ * acl_entry_size as its size, and then the acl_t itself.  A NULL aclp
+ * is accepted and ignored.
+ */
+void
+acl_free(acl_t *aclp)
+{
+	int nbytes;
+
+	if (aclp == NULL)
+		return;
+
+	if (aclp->acl_aclp != NULL) {
+		nbytes = aclp->acl_cnt * aclp->acl_entry_size;
+		cacl_free(aclp->acl_aclp, nbytes);
+	}
+
+	cacl_free(aclp, sizeof (acl_t));
+}
+
+/*
+ * Build the part of an ACE access mask made of ACE_SYNCHRONIZE,
+ * ACE_WRITE_OWNER, ACE_DELETE, ACE_WRITE_ATTRIBUTES, ACE_READ_NAMED_ATTRS
+ * and ACE_WRITE_NAMED_ATTRS for an ALLOW (isallow != 0) or DENY entry.
+ *
+ * Each bit has a candidate policy flag chosen from the arguments; the bit
+ * is emitted only if that flag is present in acl_produce.  With the
+ * acl_produce value hard-coded below the outcome is:
+ *   ALLOW: ACE_SYNCHRONIZE, plus ACE_WRITE_ATTRIBUTES when isowner.
+ *   DENY:  ACE_WRITE_ATTRIBUTES when !isowner, nothing otherwise.
+ */
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ /* The policy flags that actually produce a bit in the result. */
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ /* A candidate left at 0 can never match acl_produce. */
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given a mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ *
+ * Only the "other" permission bits of mode (S_IROTH, S_IWOTH, S_IXOTH)
+ * are examined.
+ *	isdir	non-zero adds ACE_DELETE_CHILD when write is present
+ *	isowner	non-zero when the entry is for the file owner
+ *	isallow	non-zero for an ALLOW entry, zero for a DENY entry
+ * Returns the resulting access mask.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow)
+{
+	uint32_t access = 0;
+	int haswriteperm = 0;
+	int hasreadperm = 0;
+
+	/* For a DENY entry the sense of the permission bits is inverted. */
+	if (isallow) {
+		haswriteperm = (mode & S_IWOTH);
+		hasreadperm = (mode & S_IROTH);
+	} else {
+		haswriteperm = !(mode & S_IWOTH);
+		hasreadperm = !(mode & S_IROTH);
+	}
+
+	/*
+	 * The following call takes care of correctly setting the following
+	 * mask bits in the access_mask:
+	 * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+	 * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+	 */
+	access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+	if (isallow) {
+		access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+		if (isowner)
+			access |= ACE_WRITE_ACL;
+	} else {
+		if (! isowner)
+			access |= ACE_WRITE_ACL;
+	}
+
+	/* read */
+	if (mode & S_IROTH) {
+		access |= ACE_READ_DATA;
+	}
+	/* write */
+	if (mode & S_IWOTH) {
+		access |= ACE_WRITE_DATA |
+		    ACE_APPEND_DATA;
+		if (isdir)
+			access |= ACE_DELETE_CHILD;
+	}
+	/* exec: S_IXOTH, spelled symbolically like the two tests above */
+	if (mode & S_IXOTH) {
+		access |= ACE_EXECUTE;
+	}
+
+	return (access);
+}
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ *
+ * The DENY entry starts as a copy of the ALLOW entry.  Its access mask
+ * has the POSIX-representable bits (and ACE_DELETE_CHILD for
+ * directories) toggled, and the bits owned by access_mask_set() replaced
+ * by what that routine yields for a deny entry.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+	uint32_t toggle = ACE_POSIX_SUPPORTED_BITS;
+	uint32_t denymask;
+
+	/* The copy carries a_who and a_flags over along with the mask. */
+	(void) memcpy(deny, allow, sizeof (ace_t));
+	deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+
+	if (isdir)
+		toggle ^= ACE_DELETE_CHILD;
+	denymask = deny->a_access_mask ^ toggle;
+
+	denymask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+	    ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+	    ACE_WRITE_NAMED_ATTRS);
+	denymask |= access_mask_set((allow->a_access_mask &
+	    ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+	    B_FALSE);
+
+	deny->a_access_mask = denymask;
+}
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ *
+ * Returns 0 on success, or EINVAL when a second CLASS_OBJ entry is
+ * found or when there is no CLASS_OBJ entry although more than one
+ * user/group entry was counted.  *mask defaults to 07.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+    int *hasmask, mode_t *mask,
+    int *numuser, int *numgroup, int *needsort)
+{
+	int idx;
+	int highest = 0;
+
+	*hasmask = 0;
+	*mask = 07;
+	*needsort = 0;
+	*numuser = 0;
+	*numgroup = 0;
+
+	for (idx = 0; idx < n; idx++) {
+		int t = aclent[idx].a_type;
+
+		/* A type lower than one already seen means out of order. */
+		if (t < highest)
+			*needsort = 1;
+		else if (t > highest)
+			highest = t;
+
+		if (t & USER)
+			(*numuser)++;
+		if (t & (GROUP | GROUP_OBJ))
+			(*numgroup)++;
+
+		if (!(t & CLASS_OBJ))
+			continue;
+		if (*hasmask)
+			return (EINVAL);
+		*hasmask = 1;
+		*mask = aclent[idx].a_perm;
+	}
+
+	if (!*hasmask && (*numuser + *numgroup > 1))
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ *
+ *	aclent, n	source entries; sorted in place if out of order
+ *	acepp		receives the newly allocated ace_t array
+ *	rescount	receives the number of entries in that array
+ *	isdir		non-zero when the ACL belongs to a directory
+ *
+ * Returns 0 on success.  On failure (EINVAL for a malformed source, or
+ * the error reported by the allocator) nothing is stored through acepp
+ * or rescount and any partial result is freed.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+	int error = 0;
+	mode_t mask;
+	int numuser, numgroup, needsort;
+	int resultsize = 0;
+	int i, groupi = 0, skip;
+	ace_t *acep, *result = NULL;
+	int hasmask;
+
+	error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+	    &numuser, &numgroup, &needsort);
+	if (error != 0)
+		goto out;
+
+	/* allow + deny for each aclent */
+	resultsize = n * 2;
+	if (hasmask) {
+		/*
+		 * stick extra deny on the group_obj and on each
+		 * user|group for the mask (the group_obj was added
+		 * into the count for numgroup)
+		 */
+		resultsize += numuser + numgroup;
+		/* ... and don't count the mask itself */
+		resultsize -= 2;
+	}
+
+	/* sort the source if necessary */
+	if (needsort)
+		ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+	/*
+	 * Propagate the allocator's error code: jumping to "out" with
+	 * error still 0 would report success without ever storing
+	 * anything through acepp/rescount.
+	 */
+	error = cacl_malloc((void **)&result, resultsize * sizeof (ace_t));
+	if (error != 0)
+		goto out;
+
+	acep = result;
+
+	for (i = 0; i < n; i++) {
+		/*
+		 * don't process CLASS_OBJ (mask); mask was grabbed in
+		 * ln_aent_preprocess()
+		 */
+		if (aclent[i].a_type & CLASS_OBJ)
+			continue;
+
+		/* If we need an ACL_MASK emulator, prepend it now */
+		if ((hasmask) &&
+		    (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+			acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+			acep->a_flags = 0;
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = (uid_t)-1;
+				acep->a_flags |=
+				    (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+			} else if (aclent[i].a_type & USER) {
+				acep->a_who = aclent[i].a_id;
+			} else {
+				acep->a_who = aclent[i].a_id;
+				acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			}
+			if (aclent[i].a_type & ACL_DEFAULT) {
+				acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+				    ACE_FILE_INHERIT_ACE |
+				    ACE_DIRECTORY_INHERIT_ACE;
+			}
+			/*
+			 * Set the access mask for the prepended deny
+			 * ace.  To do this, we invert the mask (found
+			 * in ln_aent_preprocess()) then convert it to an
+			 * DENY ace access_mask.
+			 */
+			acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+			    isdir, 0, 0);
+			acep += 1;
+		}
+
+		/* handle a_perm -> access_mask */
+		acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+		    isdir, aclent[i].a_type & USER_OBJ, 1);
+
+		/* emulate a default aclent */
+		if (aclent[i].a_type & ACL_DEFAULT) {
+			acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+			    ACE_FILE_INHERIT_ACE |
+			    ACE_DIRECTORY_INHERIT_ACE;
+		}
+
+		/*
+		 * handle a_perm and a_id
+		 *
+		 * this must be done last, since it involves the
+		 * corresponding deny aces, which are handled
+		 * differently for each different a_type.
+		 */
+		if (aclent[i].a_type & USER_OBJ) {
+			acep->a_who = (uid_t)-1;
+			acep->a_flags |= ACE_OWNER;
+			ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+			acep += 2;
+		} else if (aclent[i].a_type & USER) {
+			acep->a_who = aclent[i].a_id;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = (uid_t)-1;
+				acep->a_flags |= ACE_GROUP;
+			} else {
+				acep->a_who = aclent[i].a_id;
+			}
+			acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			/*
+			 * Set the corresponding deny for the group ace.
+			 *
+			 * The deny aces go after all of the groups, unlike
+			 * everything else, where they immediately follow
+			 * the allow ace.
+			 *
+			 * We calculate "skip", the number of slots to
+			 * skip ahead for the deny ace, here.
+			 *
+			 * The pattern is:
+			 * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+			 * thus, skip is
+			 * (2 * numgroup) - 1 - groupi
+			 * (2 * numgroup) to account for MD + A
+			 * - 1 to account for the fact that we're on the
+			 * access (A), not the mask (MD)
+			 * - groupi to account for the fact that we have
+			 * passed up groupi number of MD's.
+			 */
+			skip = (2 * numgroup) - 1 - groupi;
+			ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+			/*
+			 * If we just did the last group, skip acep past
+			 * all of the denies; else, just move ahead one.
+			 */
+			if (++groupi >= numgroup)
+				acep += numgroup + 1;
+			else
+				acep += 1;
+		} else if (aclent[i].a_type & OTHER_OBJ) {
+			acep->a_who = (uid_t)-1;
+			acep->a_flags |= ACE_EVERYONE;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	*acepp = result;
+	*rescount = resultsize;
+
+out:
+	if (error != 0) {
+		if ((result != NULL) && (resultsize > 0)) {
+			cacl_free(result, resultsize * sizeof (ace_t));
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Convert a complete aclent_t ACL (access entries plus, for directories,
+ * default entries) into a single ace_t array.
+ *
+ * The source array is sorted in place, split at the first ACL_DEFAULT
+ * entry, and each half is converted with ln_aent_to_ace(); the default
+ * half is then appended to the access half.
+ *
+ *	retacep		receives the combined ace_t array
+ *	retacecnt	receives its entry count
+ *
+ * Returns 0 on success, EINVAL if default entries are present on a
+ * non-directory, ENOMEM if the arrays cannot be merged, or whatever
+ * ln_aent_to_ace() reports.  Nothing is left allocated on failure.
+ */
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir,
+    ace_t **retacep, int *retacecnt)
+{
+	ace_t *acep = NULL;
+	ace_t *dfacep = NULL;
+	ace_t *merged;
+	int acecnt = 0;
+	int dfacecnt = 0;
+	int dfaclstart = 0;
+	int dfaclcnt = 0;
+	aclent_t *aclp;
+	int i;
+	int error;
+	int acesz, dfacesz;
+
+	ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+	/* After sorting, i ends up at the first default entry, if any. */
+	for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+		if (aclp->a_type & ACL_DEFAULT)
+			break;
+	}
+
+	if (i < aclcnt) {
+		dfaclstart = i;
+		dfaclcnt = aclcnt - i;
+	}
+
+	if (dfaclcnt && isdir == 0) {
+		return (EINVAL);
+	}
+
+	error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+	if (error)
+		return (error);
+
+	if (dfaclcnt) {
+		error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+		    &dfacep, &dfacecnt, isdir);
+		if (error) {
+			if (acep) {
+				cacl_free(acep, acecnt * sizeof (ace_t));
+			}
+			return (error);
+		}
+	}
+
+	if (dfacecnt != 0) {
+		acesz = sizeof (ace_t) * acecnt;
+		dfacesz = sizeof (ace_t) * dfacecnt;
+		/*
+		 * Keep the old pointer until the resize is known to have
+		 * worked: a failed realloc() leaves the original buffer
+		 * allocated, and both halves must then be released.
+		 */
+		merged = cacl_realloc(acep, acesz, acesz + dfacesz);
+		if (merged == NULL) {
+			cacl_free(acep, acesz);
+			cacl_free(dfacep, dfacesz);
+			return (ENOMEM);
+		}
+		acep = merged;
+		(void) memcpy(acep + acecnt, dfacep, dfacesz);
+	}
+	if (dfaclcnt)
+		cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+	*retacecnt = acecnt + dfacecnt;
+	*retacep = acep;
+	return (0);
+}
+
+/*
+ * Translate an ACE access mask into "other"-class permission bits.
+ *
+ * Read and execute map straight from ACE_READ_DATA and ACE_EXECUTE.
+ * Write needs ACE_WRITE_DATA and ACE_APPEND_DATA together (plus
+ * ACE_DELETE_CHILD for a directory); having only some of them cannot
+ * be expressed and yields ENOTSUP with *modep left untouched.
+ * Returns 0 and stores the bits in *modep otherwise.
+ */
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, int isdir)
+{
+	uint32_t wr_needed = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+	uint32_t wr_have;
+	o_mode_t perms = 0;
+
+	if (isdir)
+		wr_needed |= ACE_DELETE_CHILD;
+	wr_have = mask & wr_needed;
+
+	/* all-or-nothing: a partial set of write bits is rejected */
+	if (wr_have != 0 && wr_have != wr_needed)
+		return (ENOTSUP);
+
+	if (mask & ACE_READ_DATA)
+		perms |= S_IROTH;
+	if (wr_have != 0)
+		perms |= S_IWOTH;
+	if (mask & ACE_EXECUTE)
+		perms |= S_IXOTH;
+
+	*modep = perms;
+	return (0);
+}
+
+/*
+ * Reset *vals: every field is zeroed, then allowed, denied and mask are
+ * set to ACE_MASK_UNDEFINED and key is stored as the entry's id.
+ */
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+/*
+ * Put an ace_list_t into its starting state: the three embedded
+ * acevals_t records are reset with key 0, the counters, mask fields and
+ * "seen" bitmask are cleared, state becomes ace_unused and dfacl_flag is
+ * stored as given.
+ * NOTE(review): the "user" and "group" AVL trees are not touched here;
+ * presumably the caller sets them up -- confirm.
+ */
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * The id is ace->a_who.  When no holder exists yet, a new one is
+ * allocated, initialised, inserted into the tree and *num is bumped.
+ * Returns the holder, or NULL if the allocation fails.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+	acevals_t probe;
+	acevals_t *holder;
+	avl_index_t where;
+
+	probe.key = ace->a_who;
+	holder = avl_find(avl, &probe, &where);
+
+	if (holder == NULL) {
+		/* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+		if (cacl_malloc((void **)&holder, sizeof (acevals_t)) != 0)
+			return (NULL);
+
+		acevals_init(holder, ace->a_who);
+		avl_insert(avl, holder, where);
+		(*num)++;
+	}
+
+	return (holder);
+}
+
+/*
+ * Check whether one "special" permission bit of an ACE is in a state
+ * that the aclent_t translation can accept.
+ *
+ *	acep		the ACE being examined; anything that is not a
+ *			DENY entry is handled as an ALLOW entry
+ *	mask_bit	the single permission to check: ACE_SYNCHRONIZE,
+ *			ACE_WRITE_OWNER, ACE_DELETE, ACE_WRITE_ATTRIBUTES,
+ *			ACE_READ_NAMED_ATTRS or ACE_WRITE_NAMED_ATTRS
+ *	isowner		non-zero when the ACE applies to the file owner
+ *
+ * Returns 0 when the bit is acceptable, ENOTSUP when the bit is set (or
+ * clear) contrary to the policy selected in acl_consume below, and
+ * EINVAL when mask_bit is not one of the permissions listed above.
+ */
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+	int set_deny, err_deny;
+	int set_allow, err_allow;
+	int acl_consume;
+	int haswriteperm, hasreadperm;
+
+	/*
+	 * An entry "has" read/write permission when an ALLOW grants the
+	 * data bit, or when a DENY does not take it away.
+	 */
+	if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+		haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+		hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+	} else {
+		haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+		hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+	}
+
+	/* the set of SET/ERR rules that are actually enforced */
+	acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+	    ACL_DELETE_ERR_DENY |
+	    ACL_WRITE_OWNER_ERR_DENY |
+	    ACL_WRITE_OWNER_ERR_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+	    ACL_WRITE_ATTRS_WRITER_SET_DENY |
+	    ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+	    ACL_WRITE_NAMED_WRITER_ERR_DENY |
+	    ACL_READ_NAMED_READER_ERR_DENY);
+
+	/* pick the four rule flags that apply to this permission */
+	if (mask_bit == ACE_SYNCHRONIZE) {
+		set_deny = ACL_SYNCHRONIZE_SET_DENY;
+		err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+		set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+		err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_OWNER) {
+		set_deny = ACL_WRITE_OWNER_SET_DENY;
+		err_deny = ACL_WRITE_OWNER_ERR_DENY;
+		set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+		err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+	} else if (mask_bit == ACE_DELETE) {
+		set_deny = ACL_DELETE_SET_DENY;
+		err_deny = ACL_DELETE_ERR_DENY;
+		set_allow = ACL_DELETE_SET_ALLOW;
+		err_allow = ACL_DELETE_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+		if (isowner) {
+			set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+			err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+			set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+			err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+		} else if (haswriteperm) {
+			set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+			err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+			set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+			err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+		} else {
+			/*
+			 * Neither owner nor writer: an ALLOW entry must not
+			 * grant write-attributes.  The type is compared with
+			 * '==' rather than tested with '&' because the ALLOW
+			 * type constant is expected to be 0, which would
+			 * make a bitwise test always false.
+			 */
+			if ((acep->a_access_mask & mask_bit) &&
+			    (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+				return (ENOTSUP);
+			}
+			return (0);
+		}
+	} else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+		/* only constrained for entries that can read data */
+		if (!hasreadperm)
+			return (0);
+
+		set_deny = ACL_READ_NAMED_READER_SET_DENY;
+		err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+		set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+		err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+		/* only constrained for entries that can write data */
+		if (!haswriteperm)
+			return (0);
+
+		set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+		err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+		set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+		err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+	} else {
+		return (EINVAL);
+	}
+
+	/*
+	 * Apply the rule: a consumed SET rule requires the bit to be
+	 * present, a consumed ERR rule requires it to be absent.  The SET
+	 * rule takes precedence when both are consumed.
+	 */
+	if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+		if (acl_consume & set_deny) {
+			if (!(acep->a_access_mask & mask_bit)) {
+				return (ENOTSUP);
+			}
+		} else if (acl_consume & err_deny) {
+			if (acep->a_access_mask & mask_bit) {
+				return (ENOTSUP);
+			}
+		}
+	} else {
+		/* ACE_ACCESS_ALLOWED_ACE_TYPE */
+		if (acl_consume & set_allow) {
+			if (!(acep->a_access_mask & mask_bit)) {
+				return (ENOTSUP);
+			}
+		} else if (acl_consume & err_allow) {
+			if (acep->a_access_mask & mask_bit) {
+				return (ENOTSUP);
+			}
+		}
+	}
+	return (0);
+}
+
+/*
+ * Quick legality screen for a single ACE before it is collated into an
+ * aclent_t list.  Returns 0 when the entry may be translated, EINVAL for
+ * flag or mask bits outside the valid sets, and ENOTSUP for anything
+ * that is valid but has no aclent_t equivalent.
+ */
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+	/* permissions vetted individually, in this order */
+	static const int special_bits[] = {
+		ACE_SYNCHRONIZE,
+		ACE_WRITE_OWNER,
+		ACE_DELETE,
+		ACE_WRITE_ATTRIBUTES,
+		ACE_READ_NAMED_ATTRS,
+		ACE_WRITE_NAMED_ATTRS
+	};
+	const int nbits = (int)(sizeof (special_bits) /
+	    sizeof (special_bits[0]));
+	int is_allow = (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE);
+	int is_deny = (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE);
+	int isowner = (acep->a_flags & ACE_OWNER) ? 1 : 0;
+	int has_write, has_append;
+	int rc, i;
+
+	/* only ALLOW or DENY */
+	if (!is_allow && !is_deny)
+		return (ENOTSUP);
+
+	/* check for invalid flags */
+	if (acep->a_flags & ~(ACE_VALID_FLAG_BITS))
+		return (EINVAL);
+
+	/* some flags are illegal */
+	if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+	    ACE_FAILED_ACCESS_ACE_FLAG |
+	    ACE_NO_PROPAGATE_INHERIT_ACE))
+		return (ENOTSUP);
+
+	/* check for invalid masks */
+	if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS))
+		return (EINVAL);
+
+	/* first failing permission check decides the result */
+	for (i = 0; i < nbits; i++) {
+		rc = access_mask_check(acep, special_bits[i], isowner);
+		if (rc != 0)
+			return (rc);
+	}
+
+	/* more detailed checking of masks */
+	if (is_allow) {
+		if (!(acep->a_access_mask & ACE_READ_ATTRIBUTES))
+			return (ENOTSUP);
+
+		/* write-data and append-data must travel together */
+		has_write = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+		has_append = (acep->a_access_mask & ACE_APPEND_DATA) ? 1 : 0;
+		if (has_write != has_append)
+			return (ENOTSUP);
+	}
+
+	/* ACL enforcement: read-ACL may only appear on an ALLOW */
+	if ((acep->a_access_mask & ACE_READ_ACL) && !is_allow)
+		return (ENOTSUP);
+
+	/* write-ACL: allowed for the owner only, denied for others only */
+	if (acep->a_access_mask & ACE_WRITE_ACL) {
+		if (is_deny && isowner)
+			return (ENOTSUP);
+		if (is_allow && !isowner)
+			return (ENOTSUP);
+	}
+
+	return (0);
+}
+
+/*
+ * Convert the mask of an ALLOW entry to mode bits.  The mask must carry
+ * both ACE_READ_ACL and ACE_READ_ATTRIBUTES, otherwise ENOTSUP is
+ * returned; the rest is delegated to ace_mask_to_mode().
+ */
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir)
+{
+	const uint32_t required = ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+
+	if ((mask & required) == required)
+		return (ace_mask_to_mode(mask, modep, isdir));
+
+	return (ENOTSUP);
+}
+
+/*
+ * Fill in one aclent_t from a collated acevals_t.
+ *
+ * The allowed and denied masks must be exact complements over the
+ * POSIX-supported bits (plus ACE_DELETE_CHILD for directories), and when
+ * the list carries a mask, USER/GROUP/GROUP_OBJ entries must have seen
+ * that same mask; otherwise ENOTSUP is returned.  EINVAL is returned for
+ * an entry whose type is none of the known kinds.
+ */
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+    uid_t owner, gid_t group, int isdir)
+{
+	uint32_t complement = ACE_POSIX_SUPPORTED_BITS;
+	int rc;
+
+	if (isdir)
+		complement |= ACE_DELETE_CHILD;
+
+	if ((vals->denied ^ complement) != vals->allowed)
+		return (ENOTSUP);
+
+	if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+	    (vals->aent_type & (USER | GROUP | GROUP_OBJ)))
+		return (ENOTSUP);
+
+	rc = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+	if (rc != 0)
+		return (rc);
+
+	/* the id stored depends on what kind of entry this is */
+	dest->a_type = vals->aent_type;
+	if (dest->a_type & (USER | GROUP))
+		dest->a_id = vals->key;
+	else if (dest->a_type & USER_OBJ)
+		dest->a_id = owner;
+	else if (dest->a_type & GROUP_OBJ)
+		dest->a_id = group;
+	else if (dest->a_type & OTHER_OBJ)
+		dest->a_id = 0;
+	else
+		return (EINVAL);
+
+	return (0);
+}
+
+
+/*
+ * Convert one collated entry into the next free slot of the output
+ * array.  Fails with EINVAL unless the entry carries the expected type
+ * bit; the slot cursor is advanced only on success.
+ */
+static int
+ace_list_emit_aent(acevals_t *vals, int expect, aclent_t **slotp,
+    ace_list_t *list, uid_t owner, gid_t group, int isdir)
+{
+	int rc;
+
+	if (!(vals->aent_type & expect))
+		return (EINVAL);
+
+	rc = acevals_to_aent(vals, *slotp, list, owner, group, isdir);
+	if (rc == 0)
+		(*slotp)++;
+	return (rc);
+}
+
+/*
+ * Produce an aclent_t array from a fully collated ace_list_t.
+ *
+ * Entries are written in the order USER_OBJ, USER..., GROUP_OBJ,
+ * GROUP..., CLASS_OBJ (when applicable), OTHER_OBJ.  On success the
+ * newly allocated array and its length are returned through aclentp and
+ * aclcnt.  On failure nothing is returned through them and any array
+ * allocated here is released.
+ */
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+    uid_t owner, gid_t group, int isdir)
+{
+	const int required = USER_OBJ | GROUP_OBJ | OTHER_OBJ;
+	/*
+	 * An ACL_MASK is not fabricated if the ACL is a default ACL.
+	 * This is to follow UFS's behavior.
+	 */
+	const int want_class = ((list->hasmask) || (! list->dfacl_flag));
+	aclent_t *out = NULL;
+	aclent_t *slot;
+	acevals_t *node;
+	uint32_t classbits;
+	int total;
+	int rc;
+
+	/* the three base entries are mandatory */
+	if ((list->seen & required) != required)
+		return (ENOTSUP);
+
+	/* named users/groups are only representable together with a mask */
+	if ((! list->hasmask) && (list->numusers + list->numgroups > 0))
+		return (ENOTSUP);
+
+	total = 3 + list->numusers + list->numgroups;
+	if (want_class)
+		total += 1;
+
+	if (cacl_malloc((void **)&out, total * sizeof (aclent_t)) != 0) {
+		rc = ENOMEM;
+		goto fail;
+	}
+	slot = out;
+
+	/* USER_OBJ */
+	rc = ace_list_emit_aent(&list->user_obj, USER_OBJ, &slot, list,
+	    owner, group, isdir);
+	if (rc != 0)
+		goto fail;
+
+	/* USER */
+	node = avl_first(&list->user);
+	while (node != NULL) {
+		rc = ace_list_emit_aent(node, USER, &slot, list,
+		    owner, group, isdir);
+		if (rc != 0)
+			goto fail;
+		node = AVL_NEXT(&list->user, node);
+	}
+
+	/* GROUP_OBJ */
+	rc = ace_list_emit_aent(&list->group_obj, GROUP_OBJ, &slot, list,
+	    owner, group, isdir);
+	if (rc != 0)
+		goto fail;
+
+	/* GROUP */
+	node = avl_first(&list->group);
+	while (node != NULL) {
+		rc = ace_list_emit_aent(node, GROUP, &slot, list,
+		    owner, group, isdir);
+		if (rc != 0)
+			goto fail;
+		node = AVL_NEXT(&list->group, node);
+	}
+
+	/* CLASS_OBJ (aka ACL_MASK) */
+	if (want_class) {
+		if (list->hasmask) {
+			/* the recorded mask is a deny mask; flip it */
+			classbits = ACE_POSIX_SUPPORTED_BITS;
+			if (isdir)
+				classbits |= ACE_DELETE_CHILD;
+			classbits = list->acl_mask ^ classbits;
+		} else {
+			/* fabricate the ACL_MASK from the group permissions */
+			classbits = list->group_obj.allowed;
+		}
+		rc = ace_mask_to_mode(classbits, &slot->a_perm, isdir);
+		if (rc != 0)
+			goto fail;
+		slot->a_id = 0;
+		slot->a_type = CLASS_OBJ | list->dfacl_flag;
+		slot++;
+	}
+
+	/* OTHER_OBJ */
+	rc = ace_list_emit_aent(&list->other_obj, OTHER_OBJ, &slot, list,
+	    owner, group, isdir);
+	if (rc != 0)
+		goto fail;
+
+	*aclentp = out;
+	*aclcnt = total;
+	return (0);
+
+fail:
+	if (out != NULL)
+		cacl_free(out, total * sizeof (aclent_t));
+	return (rc);
+}
+
+
+/*
+ * Release an ace_list_t: every node in its user and group trees, the
+ * trees themselves, and finally the list structure.  A NULL list is
+ * accepted and ignored.
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+	acevals_t *victim;
+	void *cookie;
+
+	if (al != NULL) {
+		/* drain the named-user tree */
+		for (cookie = NULL;
+		    (victim = avl_destroy_nodes(&al->user, &cookie)) != NULL; )
+			cacl_free(victim, sizeof (acevals_t));
+
+		/* drain the named-group tree */
+		for (cookie = NULL;
+		    (victim = avl_destroy_nodes(&al->group, &cookie)) != NULL; )
+			cacl_free(victim, sizeof (acevals_t));
+
+		avl_destroy(&al->user);
+		avl_destroy(&al->group);
+
+		/* free the container itself */
+		cacl_free(al, sizeof (ace_list_t));
+	}
+}
+
+/*
+ * Ordering callback for the acevals avl trees: compares two acevals_t
+ * by key and returns -1, 0 or 1.
+ */
+static int
+acevals_compare(const void *va, const void *vb)
+{
+	const acevals_t *lhs = va;
+	const acevals_t *rhs = vb;
+
+	if (lhs->key > rhs->key)
+		return (1);
+	if (lhs->key == rhs->key)
+		return (0);
+	return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists.  Return error (ENOTSUP) when conversion is not possible.
+ *
+ *	ace, n		the entries to convert; at least six are needed
+ *	owner, group	ids stored in the USER_OBJ / GROUP_OBJ results
+ *	aclentp, aclcnt	receive the regular list and its length
+ *	dfaclentp,
+ *	dfaclcnt	receive the default list and its length
+ *	isdir		non-zero when the object is a directory
+ *
+ * All four outputs are reset to NULL/0 on entry.  On success the caller
+ * owns whichever lists were produced.  On failure nothing is handed
+ * back: a list that was already built before a later step failed is
+ * released here and its output reset.
+ *
+ * Note that the a_access_mask of each processed input entry is modified
+ * (several bits are cleared) as a side effect.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+    aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+    int isdir)
+{
+	int error = 0;
+	ace_t *acep;
+	uint32_t bits;
+	int i;
+	ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+	acevals_t *vals;
+
+	*aclentp = NULL;
+	*aclcnt = 0;
+	*dfaclentp = NULL;
+	*dfaclcnt = 0;
+
+	/* we need at least user_obj, group_obj, and other_obj */
+	if (n < 6) {
+		error = ENOTSUP;
+		goto out;
+	}
+	if (ace == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+	if (error != 0)
+		goto out;
+
+	avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+
+	ace_list_init(normacl, 0);
+
+	error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+	if (error != 0)
+		goto out;
+
+	avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	ace_list_init(dfacl, ACL_DEFAULT);
+
+	/* process every ace_t... */
+	for (i = 0; i < n; i++) {
+		acep = &ace[i];
+
+		/* rule out certain cases quickly */
+		error = ace_to_aent_legal(acep);
+		if (error != 0)
+			goto out;
+
+		/*
+		 * Turn off these bits in order to not have to worry about
+		 * them when doing the checks for complements.
+		 */
+		acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+		    ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+		    ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+		/* see if this should be a regular or default acl */
+		bits = acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE |
+		    ACE_FILE_INHERIT_ACE |
+		    ACE_DIRECTORY_INHERIT_ACE);
+		if (bits != 0) {
+			/* all or nothing on these inherit bits */
+			if (bits != (ACE_INHERIT_ONLY_ACE |
+			    ACE_FILE_INHERIT_ACE |
+			    ACE_DIRECTORY_INHERIT_ACE)) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl = dfacl;
+		} else {
+			acl = normacl;
+		}
+
+		/*
+		 * Pick the slot this entry collates into.  acl->state
+		 * tracks the most recent kind seen so that entries
+		 * arriving out of order are rejected.
+		 */
+		if ((acep->a_flags & ACE_OWNER)) {
+			if (acl->state > ace_user_obj) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl->state = ace_user_obj;
+			acl->seen |= USER_OBJ;
+			vals = &acl->user_obj;
+			vals->aent_type = USER_OBJ | acl->dfacl_flag;
+		} else if ((acep->a_flags & ACE_EVERYONE)) {
+			acl->state = ace_other_obj;
+			acl->seen |= OTHER_OBJ;
+			vals = &acl->other_obj;
+			vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+		} else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+			if (acl->state > ace_group) {
+				error = ENOTSUP;
+				goto out;
+			}
+			if ((acep->a_flags & ACE_GROUP)) {
+				acl->seen |= GROUP_OBJ;
+				vals = &acl->group_obj;
+				vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+			} else {
+				acl->seen |= GROUP;
+				vals = acevals_find(acep, &acl->group,
+				    &acl->numgroups);
+				if (vals == NULL) {
+					error = ENOMEM;
+					goto out;
+				}
+				vals->aent_type = GROUP | acl->dfacl_flag;
+			}
+			acl->state = ace_group;
+		} else {
+			if (acl->state > ace_user) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl->state = ace_user;
+			acl->seen |= USER;
+			vals = acevals_find(acep, &acl->user,
+			    &acl->numusers);
+			if (vals == NULL) {
+				error = ENOMEM;
+				goto out;
+			}
+			vals->aent_type = USER | acl->dfacl_flag;
+		}
+
+		if (!(acl->state > ace_unused)) {
+			error = EINVAL;
+			goto out;
+		}
+
+		if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			/* no more than one allowed per aclent_t */
+			if (vals->allowed != ACE_MASK_UNDEFINED) {
+				error = ENOTSUP;
+				goto out;
+			}
+			vals->allowed = acep->a_access_mask;
+		} else {
+			/*
+			 * it's a DENY; if there was a previous DENY, it
+			 * must have been an ACL_MASK.
+			 */
+			if (vals->denied != ACE_MASK_UNDEFINED) {
+				/* ACL_MASK is for USER and GROUP only */
+				if ((acl->state != ace_user) &&
+				    (acl->state != ace_group)) {
+					error = ENOTSUP;
+					goto out;
+				}
+
+				if (! acl->hasmask) {
+					acl->hasmask = 1;
+					acl->acl_mask = vals->denied;
+				/* check for mismatched ACL_MASK emulations */
+				} else if (acl->acl_mask != vals->denied) {
+					error = ENOTSUP;
+					goto out;
+				}
+				vals->mask = vals->denied;
+			}
+			vals->denied = acep->a_access_mask;
+		}
+	}
+
+	/* done collating; produce the aclent_t lists */
+	if (normacl->state != ace_unused) {
+		error = ace_list_to_aent(normacl, aclentp, aclcnt,
+		    owner, group, isdir);
+		if (error != 0) {
+			goto out;
+		}
+	}
+	if (dfacl->state != ace_unused) {
+		error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+		    owner, group, isdir);
+		if (error != 0) {
+			goto out;
+		}
+	}
+
+out:
+	if (normacl != NULL)
+		ace_list_free(normacl);
+	if (dfacl != NULL)
+		ace_list_free(dfacl);
+
+	/*
+	 * A failure after one list was already produced (e.g. the regular
+	 * list succeeded but the default list did not) must not leak that
+	 * list: the caller only looks at the outputs on success.
+	 */
+	if (error != 0) {
+		if (*aclentp != NULL) {
+			cacl_free(*aclentp, *aclcnt * sizeof (aclent_t));
+			*aclentp = NULL;
+			*aclcnt = 0;
+		}
+		if (*dfaclentp != NULL) {
+			cacl_free(*dfaclentp, *dfaclcnt * sizeof (aclent_t));
+			*dfaclentp = NULL;
+			*dfaclcnt = 0;
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Translate an ace_t array into a single aclent_t array holding the
+ * regular entries followed by the default entries.
+ *
+ * On success returns 0 and, when a result array exists, hands it and its
+ * entry count back through retaclentp / retaclcnt; the caller owns it.
+ * On failure returns the error from ln_ace_to_aent(), or ENOMEM when the
+ * two lists cannot be merged, and leaves the outputs untouched.
+ */
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir,
+    uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+	int error = 0;
+	aclent_t *aclentp, *dfaclentp;
+	aclent_t *merged;
+	int aclcnt, dfaclcnt;
+	size_t aclsz, dfaclsz = 0;
+
+	error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+	    &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+	if (error)
+		return (error);
+
+	if (dfaclcnt != 0) {
+		/*
+		 * Slap aclentp and dfaclentp into a single array.
+		 */
+		aclsz = sizeof (aclent_t) * aclcnt;
+		dfaclsz = sizeof (aclent_t) * dfaclcnt;
+		merged = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+		if (merged != NULL) {
+			(void) memcpy(merged + aclcnt, dfaclentp, dfaclsz);
+			aclentp = merged;
+		} else {
+			/*
+			 * Assumes cacl_realloc() follows realloc(3) and
+			 * leaves the original buffer allocated on failure,
+			 * so it has to be released here rather than lost.
+			 */
+			if (aclentp != NULL)
+				cacl_free(aclentp, aclsz);
+			aclentp = NULL;
+			error = ENOMEM;
+		}
+	}
+
+	if (aclentp) {
+		*retaclentp = aclentp;
+		*retaclcnt = aclcnt + dfaclcnt;
+	}
+
+	/* the default entries have been copied (or abandoned) by now */
+	if (dfaclentp)
+		cacl_free(dfaclentp, dfaclsz);
+
+	return (error);
+}
+
+
+/*
+ * Convert aclp in place to the requested flavor (_ACL_ACE_ENABLED or
+ * _ACL_ACLENT_ENABLED).  When the ACL already has that flavor nothing is
+ * done.  On success the old entry array is freed and replaced and 0 is
+ * returned.  On failure the ACL is left alone; the kernel build returns
+ * the error number, other builds store it in errno and return -1.
+ */
+int
+acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner,
+    gid_t group)
+{
+	const int to_ace = (target_flavor == _ACL_ACE_ENABLED);
+	const int to_aent = (target_flavor == _ACL_ACLENT_ENABLED);
+	void *newdata;
+	int newcnt;
+	int rc;
+
+	/*
+	 * See if we need to translate
+	 */
+	if ((to_ace && aclp->acl_type == ACE_T) ||
+	    (to_aent && aclp->acl_type == ACLENT_T))
+		return (0);
+
+	if (target_flavor == -1) {
+		rc = EINVAL;
+	} else if (to_ace && aclp->acl_type == ACLENT_T) {
+		rc = convert_aent_to_ace(aclp->acl_aclp,
+		    aclp->acl_cnt, isdir, (ace_t **)&newdata, &newcnt);
+	} else if (to_aent && aclp->acl_type == ACE_T) {
+		rc = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+		    isdir, owner, group, (aclent_t **)&newdata, &newcnt);
+	} else {
+		rc = ENOTSUP;
+	}
+
+	if (rc == 0) {
+		/*
+		 * replace old acl with newly translated acl
+		 */
+		cacl_free(aclp->acl_aclp,
+		    aclp->acl_cnt * aclp->acl_entry_size);
+		aclp->acl_aclp = newdata;
+		aclp->acl_cnt = newcnt;
+		if (to_ace) {
+			aclp->acl_type = ACE_T;
+			aclp->acl_entry_size = sizeof (ace_t);
+		} else {
+			aclp->acl_type = ACLENT_T;
+			aclp->acl_entry_size = sizeof (aclent_t);
+		}
+		return (0);
+	}
+
+#if !defined(_KERNEL)
+	errno = rc;
+	return (-1);
+#else
+	return (rc);
+#endif
+}
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
index 2227ad77ea93..84bd04f52fd6 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#ifndef _ACL_ACL_UTILS_H
-#define _ACL_ACL_UTILS_H
+#ifndef _ACL_COMMON_H
+#define _ACL_COMMON_H
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -34,7 +33,7 @@
#include <sys/acl.h>
#include <sys/stat.h>
-#ifdef __cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
@@ -42,15 +41,21 @@ extern ace_t trivial_acl[6];
extern int acltrivial(const char *);
extern void adjust_ace_pair(ace_t *pair, mode_t mode);
+extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t);
extern int ace_trivial(ace_t *acep, int aclcnt);
+extern int ace_trivial_common(void *, int,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
+ uint32_t *mask));
+extern acl_t *acl_alloc(acl_type_t);
+extern void acl_free(acl_t *aclp);
+extern int acl_translate(acl_t *aclp, int target_flavor,
+ int isdir, uid_t owner, gid_t group);
void ksort(caddr_t v, int n, int s, int (*f)());
int cmp2acls(void *a, void *b);
-
-
-#ifdef __cplusplus
+#ifdef __cplusplus
}
#endif
-#endif /* _ACL_ACL_UTILS_H */
+#endif /* _ACL_COMMON_H */
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
index 2e62aa420d91..6851086c1f96 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,14 +18,13 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
- .ident "%Z%%M% %I% %E% SMI"
-
- .file "%M%"
+ .file "atomic.s"
#define _ASM
#include <sys/asm_linkage.h>
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
index bc7f22acc8d4..57f7d0a47b53 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,18 +18,54 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
- .ident "%Z%%M% %I% %E% SMI"
-
- .file "%M%"
+ .file "atomic.s"
#define _ASM
#include <sys/asm_linkage.h>
+ /*
+ * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_dec_64_nv.
+ */
+ ENTRY(atomic_dec_64)
+ ALTENTRY(atomic_dec_64_nv)
+ pushl %edi // %edi and %ebx are used below; restored before ret
+ pushl %ebx
+ movl 12(%esp), %edi // %edi = target address
+ movl (%edi), %eax
+ movl 4(%edi), %edx // %edx:%eax = old value
+1:
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ not %ecx
+ not %ebx // %ecx:%ebx = -1
+ addl %eax, %ebx // low word: old + 0xffffffff
+ adcl %edx, %ecx // high word plus carry: %ecx:%ebx = old - 1
+ lock
+ cmpxchg8b (%edi) // try to stick it in
+ jne 1b // target changed underneath us: recompute and retry
+ movl %ebx, %eax
+ movl %ecx, %edx // return new value
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_dec_64_nv)
+ SET_SIZE(atomic_dec_64)
+
+ /*
+ * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_add_64_nv.
+ */
ENTRY(atomic_add_64)
ALTENTRY(atomic_add_64_nv)
pushl %edi
diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c
index 1fa2236607bf..01aa3cb2fa9d 100644
--- a/sys/cddl/contrib/opensolaris/common/avl/avl.c
+++ b/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -808,6 +808,64 @@ avl_remove(avl_tree_t *tree, void *data)
} while (parent != NULL);
}
+#define AVL_REINSERT(tree, obj) \
+ avl_remove((tree), (obj)); \
+ avl_add((tree), (obj))
+
+/*
+ * Re-position 'obj' after its key has decreased.  The object is
+ * reinserted only when it now compares less than its predecessor;
+ * returns B_TRUE if it was moved and B_FALSE otherwise.  The assertion
+ * checks that obj is still not greater than its successor.
+ */
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+	void *adj;
+
+	ASSERT(((adj = AVL_NEXT(t, obj)) == NULL) ||
+	    (t->avl_compar(obj, adj) <= 0));
+
+	adj = AVL_PREV(t, obj);
+	if (adj == NULL || t->avl_compar(obj, adj) >= 0)
+		return (B_FALSE);
+
+	AVL_REINSERT(t, obj);
+	return (B_TRUE);
+}
+
+/*
+ * Re-position 'obj' after its key has increased.  The object is
+ * reinserted only when it now compares greater than its successor;
+ * returns B_TRUE if it was moved and B_FALSE otherwise.  The assertion
+ * checks that obj is still not less than its predecessor.
+ */
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+	void *adj;
+
+	ASSERT(((adj = AVL_PREV(t, obj)) == NULL) ||
+	    (t->avl_compar(obj, adj) >= 0));
+
+	adj = AVL_NEXT(t, obj);
+	if (adj == NULL || t->avl_compar(obj, adj) <= 0)
+		return (B_FALSE);
+
+	AVL_REINSERT(t, obj);
+	return (B_TRUE);
+}
+
+/*
+ * Re-position 'obj' after its key has changed in either direction.  If
+ * it is now out of order with respect to its predecessor or its
+ * successor it is reinserted and B_TRUE is returned; otherwise the tree
+ * is left alone and B_FALSE is returned.
+ */
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+	void *before = AVL_PREV(t, obj);
+	void *after;
+
+	if (before != NULL && t->avl_compar(obj, before) < 0) {
+		AVL_REINSERT(t, obj);
+		return (B_TRUE);
+	}
+
+	/* the successor is only consulted when the predecessor was fine */
+	after = AVL_NEXT(t, obj);
+	if (after != NULL && t->avl_compar(obj, after) > 0) {
+		AVL_REINSERT(t, obj);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
/*
* initialize a new AVL tree
*/
@@ -853,6 +911,12 @@ avl_numnodes(avl_tree_t *tree)
return (tree->avl_numnodes);
}
+/*
+ * Report whether the tree currently holds no nodes.
+ */
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+	ASSERT(tree);
+
+	if (tree->avl_numnodes == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+}
#define CHILDBIT (1L)
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
index d3d5bed525ab..9cdc534ffc1b 100644
--- a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,15 +34,17 @@
#if defined(_KERNEL) && !defined(_BOOT)
#include <sys/varargs.h>
+#include <sys/sunddi.h>
#else
#include <stdarg.h>
-#include <strings.h>
+#include <stdlib.h>
+#include <string.h>
#endif
#ifndef offsetof
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
#endif
-
+#define	skip_whitespace(p)	while ((*(p) == ' ') || (*(p) == '\t')) (p)++
/*
* nvpair.c - Provides kernel & userland interfaces for manipulating
@@ -201,7 +203,7 @@ nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
static void
nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
{
- bzero(priv, sizeof (priv));
+ bzero(priv, sizeof (nvpriv_t));
priv->nvp_nva = nva;
priv->nvp_stat = stat;
@@ -395,6 +397,9 @@ i_validate_type_nelem(data_type_t type, uint_t nelem)
case DATA_TYPE_STRING:
case DATA_TYPE_HRTIME:
case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
if (nelem != 1)
return (EINVAL);
break;
@@ -733,6 +738,11 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
case DATA_TYPE_UINT64:
value_sz = sizeof (uint64_t);
break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ value_sz = sizeof (double);
+ break;
+#endif
case DATA_TYPE_STRING:
if (data == NULL)
value_sz = 0;
@@ -1017,6 +1027,14 @@ nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
}
+#if !defined(_KERNEL)
+/*
+ * Add a single double value under 'name'; thin wrapper that forwards
+ * to nvlist_add_common() with DATA_TYPE_DOUBLE and one element.
+ */
+int
+nvlist_add_double(nvlist_t *nvl, const char *name, double val)
+{
+	const double *valp = &val;
+
+	return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, valp));
+}
+#endif
+
int
nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
{
@@ -1123,13 +1141,15 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
curr = NVPAIR2I_NVP(nvp);
/*
- * Ensure that nvp is an valid pointer.
+ * Ensure that nvp is a valid nvpair on this nvlist.
+ * NB: nvp_curr is used only as a hint so that we don't always
+ * have to walk the list to determine if nvp is still on the list.
*/
if (nvp == NULL)
curr = priv->nvp_list;
- else if (priv->nvp_curr == curr)
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
curr = curr->nvi_next;
- else if (nvlist_contains_nvp(nvl, nvp) == 0)
+ else
curr = NULL;
priv->nvp_curr = curr;
@@ -1149,6 +1169,27 @@ nvpair_type(nvpair_t *nvp)
return (NVP_TYPE(nvp));
}
+/*
+ * Return 1 when the pair's data type is one of the array types listed
+ * below, 0 otherwise.
+ */
+int
+nvpair_type_is_array(nvpair_t *nvp)
+{
+	switch (NVP_TYPE(nvp)) {
+	case DATA_TYPE_BYTE_ARRAY:
+	case DATA_TYPE_UINT8_ARRAY:
+	case DATA_TYPE_INT16_ARRAY:
+	case DATA_TYPE_UINT16_ARRAY:
+	case DATA_TYPE_INT32_ARRAY:
+	case DATA_TYPE_UINT32_ARRAY:
+	case DATA_TYPE_INT64_ARRAY:
+	case DATA_TYPE_UINT64_ARRAY:
+	case DATA_TYPE_BOOLEAN_ARRAY:
+	case DATA_TYPE_STRING_ARRAY:
+	case DATA_TYPE_NVLIST_ARRAY:
+		return (1);
+	default:
+		return (0);
+	}
+}
+
static int
nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
{
@@ -1176,6 +1217,9 @@ nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
case DATA_TYPE_INT64:
case DATA_TYPE_UINT64:
case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
if (data == NULL)
return (EINVAL);
bcopy(NVP_VALUE(nvp), data,
@@ -1312,6 +1356,14 @@ nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
}
+#if !defined(_KERNEL)
+/*
+ * Look up the double stored under 'name'; thin wrapper that forwards to
+ * nvlist_lookup_common() with DATA_TYPE_DOUBLE and no element count.
+ */
+int
+nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
+{
+	int rc;
+
+	rc = nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val);
+	return (rc);
+}
+#endif
+
int
nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
{
@@ -1446,6 +1498,9 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
case DATA_TYPE_HRTIME:
case DATA_TYPE_STRING:
case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
val = va_arg(ap, void *);
ret = nvlist_lookup_common(nvl, name, type, NULL, val);
break;
@@ -1479,6 +1534,224 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
return (ret);
}
+/*
+ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
+ * returns zero and a pointer to the matching nvpair is returned in '*ret'
+ * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penetrate
+ * multiple levels of embedded nvlists, with 'sep' as the separator. As an
+ * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
+ * "a.d[3].e[1]".  This matches the C syntax for array embed (for convenience,
+ * code also supports "a.d[3]e[1]" syntax).
+ *
+ * If 'ip' is non-NULL and the last name component is an array, return the
+ * value of the "...[index]" array index in *ip. For an array reference that
+ * is not indexed, *ip will be returned as -1. If there is a syntax error in
+ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
+ * inside the 'name' string where the syntax error was detected.
+ *
+ * Returns 0 when the pair is found.  EINVAL is returned for NULL
+ * arguments, for an empty or all-blank name, for a syntax error, when a
+ * component is not found, and when an intermediate nvlist array is not
+ * given an index within its bounds.  ENOTSUP is returned when a list
+ * being searched was not created with NV_UNIQUE_NAME.
+ */
+static int
+nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
+    nvpair_t **ret, int *ip, char **ep)
+{
+	nvpair_t	*nvp;
+	const char	*np;
+	char		*sepp = NULL;	/* read at 'fail' even if never set */
+	char		*idxp, *idxep;
+	nvlist_t	**nva;
+	long		idx = 0;
+	uint_t		nelem;
+	int		n;
+
+	if (ip)
+		*ip = -1;			/* not indexed */
+	if (ep)
+		*ep = NULL;
+
+	if ((nvl == NULL) || (name == NULL))
+		return (EINVAL);
+
+	/* step through components of name */
+	for (np = name; np && *np; np = sepp) {
+		/* ensure unique names */
+		if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
+			return (ENOTSUP);
+
+		/* skip white space */
+		skip_whitespace(np);
+		if (*np == 0)
+			break;
+
+		/* set 'sepp' to end of current component 'np' */
+		if (sep)
+			sepp = strchr(np, sep);
+		else
+			sepp = NULL;
+
+		/* find start of next "[ index ]..." */
+		idxp = strchr(np, '[');
+
+		/* if sepp comes first, set idxp to NULL */
+		if (sepp && idxp && (sepp < idxp))
+			idxp = NULL;
+
+		/*
+		 * At this point 'idxp' is set if there is an index
+		 * expected for the current component.
+		 */
+		if (idxp) {
+			/* set 'n' to length of current 'np' name component */
+			n = idxp++ - np;
+
+			/* keep sepp up to date for *ep use as we advance */
+			skip_whitespace(idxp);
+			sepp = idxp;
+
+			/* determine the index value */
+#if defined(_KERNEL) && !defined(_BOOT)
+			if (ddi_strtol(idxp, &idxep, 0, &idx))
+				goto fail;
+#else
+			idx = strtol(idxp, &idxep, 0);
+#endif
+			if (idxep == idxp)
+				goto fail;
+
+			/* keep sepp up to date for *ep use as we advance */
+			sepp = idxep;
+
+			/* skip white space index value and check for ']' */
+			skip_whitespace(sepp);
+			if (*sepp++ != ']')
+				goto fail;
+
+			/* for embedded arrays, support C syntax: "a[1].b" */
+			skip_whitespace(sepp);
+			if (sep && (*sepp == sep))
+				sepp++;
+		} else if (sepp) {
+			n = sepp++ - np;
+		} else {
+			n = strlen(np);
+		}
+
+		/* trim trailing whitespace by reducing length of 'np' */
+		if (n == 0)
+			goto fail;
+		for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
+			;
+		n++;
+
+		/* skip whitespace, and set sepp to NULL if complete */
+		if (sepp) {
+			skip_whitespace(sepp);
+			if (*sepp == 0)
+				sepp = NULL;
+		}
+
+		/*
+		 * At this point:
+		 * o  'n' is the length of current 'np' component.
+		 * o  'idxp' is set if there was an index, and value 'idx'.
+		 * o  'sepp' is set to the beginning of the next component,
+		 *    and set to NULL if we have no more components.
+		 *
+		 * Search for nvpair with matching component name.
+		 */
+		for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+		    nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+			/* continue if no match on name */
+			if (strncmp(np, nvpair_name(nvp), n) ||
+			    (strlen(nvpair_name(nvp)) != n))
+				continue;
+
+			/* if indexed, verify type is array oriented */
+			if (idxp && !nvpair_type_is_array(nvp))
+				goto fail;
+
+			/*
+			 * Full match found, return nvp and idx if this
+			 * was the last component.
+			 */
+			if (sepp == NULL) {
+				if (ret)
+					*ret = nvp;
+				if (ip && idxp)
+					*ip = (int)idx;	/* return index */
+				return (0);		/* found */
+			}
+
+			/*
+			 * More components: current match must be
+			 * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
+			 * to support going deeper.
+			 */
+			if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
+				nvl = EMBEDDED_NVL(nvp);
+				break;
+			} else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
+				/*
+				 * Descending into an array needs an explicit
+				 * index for this component ('idx' is only
+				 * meaningful when 'idxp' is set), and that
+				 * index must lie inside the array: neither
+				 * negative nor past the last element.
+				 */
+				if ((idxp == NULL) || (idx < 0))
+					goto fail;
+				if (nvpair_value_nvlist_array(nvp,
+				    &nva, &nelem) != 0)
+					goto fail;
+				if ((unsigned long)idx >= nelem)
+					goto fail;
+				nvl = nva[idx];
+				break;
+			}
+
+			/* type does not support more levels */
+			goto fail;
+		}
+		if (nvp == NULL)
+			goto fail;		/* 'name' not found */
+
+		/* search for match of next component in embedded 'nvl' list */
+	}
+
+fail:	if (ep && sepp)
+		*ep = sepp;
+	return (EINVAL);
+}
+
+/*
+ * Look up the nvpair called 'name' in 'nvl' and hand it back through
+ * 'ret'.  No separator is passed, and neither an index nor an error
+ * position is requested from the underlying lookup.
+ */
+int
+nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
+{
+	int rc;
+
+	rc = nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL);
+	return (rc);
+}
+
+/*
+ * Look up a possibly nested nvpair, using '.' as the separator between
+ * levels of embedded nvlists, and report the array index (through 'ip')
+ * and the error position (through 'ep').  See
+ * nvlist_lookup_nvpair_ei_sep for the detailed description.
+ */
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
+    const char *name, nvpair_t **ret, int *ip, char **ep)
+{
+	const char separator = '.';
+
+	return (nvlist_lookup_nvpair_ei_sep(nvl, name, separator, ret,
+	    ip, ep));
+}
+
+/*
+ * Report whether 'nvl' contains a pair whose name equals 'name'.
+ * Returns B_FALSE for a NULL list, a NULL name, or a list without
+ * private data.
+ */
+boolean_t
+nvlist_exists(nvlist_t *nvl, const char *name)
+{
+	nvpriv_t *priv;
+	i_nvp_t *cursor;
+
+	if (name == NULL || nvl == NULL)
+		return (B_FALSE);
+
+	priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	if (priv == NULL)
+		return (B_FALSE);
+
+	/* linear walk over the pair list, stopping at the first hit */
+	cursor = priv->nvp_list;
+	while (cursor != NULL) {
+		nvpair_t *pair = &cursor->nvi_nvp;
+
+		if (strcmp(name, NVP_NAME(pair)) == 0)
+			return (B_TRUE);
+		cursor = cursor->nvi_next;
+	}
+
+	return (B_FALSE);
+}
+
int
nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
{
@@ -1539,6 +1812,14 @@ nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
}
+#if !defined(_KERNEL)
+/*
+ * Fetch the value of a DATA_TYPE_DOUBLE pair into '*val'.  Only built
+ * outside the kernel; the type check and the copy are delegated to
+ * nvpair_value_common().
+ */
+int
+nvpair_value_double(nvpair_t *nvp, double *val)
+{
+	int rv;
+
+	rv = nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val);
+	return (rv);
+}
+#endif
+
int
nvpair_value_string(nvpair_t *nvp, char **val)
{
@@ -2676,7 +2957,11 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
*/
ret = xdr_longlong_t(xdr, (void *)buf);
break;
-
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
case DATA_TYPE_STRING:
ret = xdr_string(xdr, &buf, buflen - 1);
break;
@@ -2782,6 +3067,9 @@ nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
case DATA_TYPE_INT64:
case DATA_TYPE_UINT64:
case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
nvp_sz += 8;
break;
diff --git a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
new file mode 100644
index 000000000000..73cf74a4d159
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
@@ -0,0 +1,2130 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#else
+#include <strings.h>
+#endif /* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep.h>
+#include <sys/u8_textprep_data.h>
+
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upperbound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Hangul related macros.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
+ * U+11A8. This is because the trailing consonant is optional, and thus we
+ * pre-calculate by subtracting one.
+ *
+ * Each of the 19 modern leading consonants has a total of 588 possible
+ * syllables since Hangul has 21 modern vowels and 27 modern trailing
+ * consonants plus 1 for the no-trailing-consonant case, i.e., 21 x 28 = 588.
+ *
+ * We also have a bunch of Hangul related macros below. Please bear in mind
+ * that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a character may
+ * be a Hangul Jamo, but a match does not guarantee that it is one; it only
+ * means that it most likely is.
+ */
+#define U8_HANGUL_SYL_FIRST (0xAC00U)
+#define U8_HANGUL_SYL_LAST (0xD7A3U)
+
+#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
+#define U8_HANGUL_JAMO_L_LAST (0x1112U)
+#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
+#define U8_HANGUL_JAMO_V_LAST (0x1175U)
+#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
+#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
+
+#define U8_HANGUL_V_COUNT (21)
+#define U8_HANGUL_VT_COUNT (588)
+#define U8_HANGUL_T_COUNT (28)
+
+#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
+
+#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define U8_HANGUL_JAMO_L(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define U8_HANGUL_JAMO_V(u) \
+ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define U8_HANGUL_JAMO_T(u) \
+ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_JAMO(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_SYLLABLE(u) \
+ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
+ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
+
+/* The types of decomposition mappings. */
+#define U8_DECOMP_BOTH (0xF5U)
+#define U8_DECOMP_CANONICAL (0xF6U)
+
+/* The indicator for 16-bit table. */
+#define U8_16BIT_TABLE_INDICATOR (0x8000U)
+
+/* The following are some convenience macros. */
+#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+ (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
+ (uint32_t)(b3) & 0x3F;
+
+#define U8_SIMPLE_SWAP(a, b, t) \
+ (t) = (a); \
+ (a) = (b); \
+ (b) = (t);
+
+#define U8_ASCII_TOUPPER(c) \
+ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define U8_ASCII_TOLOWER(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If the assumptions are not met, then, the macro will fail.
+ */
+#define U8_SWAP_COMB_MARKS(a, b) \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8t[k] = u8s[start[(a)] + k]; \
+ for (k = 0; k < disp[(b)]; k++) \
+ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+ start[(b)] = start[(a)] + disp[(b)]; \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8s[start[(b)] + k] = u8t[k]; \
+ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/*
+ * The possible states during normalization.
+ *
+ * do_decomp() reports one of these through its 'state' argument: the
+ * HANGUL_* values record which Hangul Jamo pieces (L = leading consonant,
+ * V = vowel, T = trailing consonant) have just been seen or produced, and
+ * U8_STATE_START is stored for everything that is not Hangul.
+ * U8_STATE_COMBINING_MARK is not assigned by do_decomp() itself; presumably
+ * its callers set it after looking at the combining class -- TODO confirm.
+ */
+typedef enum {
+ U8_STATE_START = 0,
+ U8_STATE_HANGUL_L = 1,
+ U8_STATE_HANGUL_LV = 2,
+ U8_STATE_HANGUL_LVT = 3,
+ U8_STATE_HANGUL_V = 4,
+ U8_STATE_HANGUL_T = 5,
+ U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors below are used to check that the bytes of a given UTF-8
+ * character are valid and do not contain any malformed byte values.
+ *
+ * We used to have a quite relaxed UTF-8 binary representation, but there
+ * were some security related issues, so the Unicode Consortium defined
+ * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
+ * one more time at Unicode 3.2. The following three tables are based on
+ * that.
+ */
+
+#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
+
+#define I_ U8_ILLEGAL_CHAR
+#define O_ U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef I_
+#undef O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+
+/*
+ * u8_validate() checks that 'u8str' holds well-formed UTF-8 and counts the
+ * bytes it examined.  It is quite similar to mblen(3C) except that it is
+ * specific to UTF-8 and Unicode and can additionally reject characters that
+ * appear in a caller-supplied list.
+ *
+ *	u8str	bytes to validate; a NULL pointer yields 0.
+ *	n	number of bytes available at 'u8str'.
+ *	list	NULL-terminated vector of NUL-terminated UTF-8 characters
+ *		that must be treated as invalid; only consulted when 'flag'
+ *		has U8_VALIDATE_CHECK_ADDITIONAL set.  A NULL 'list' is
+ *		treated as an empty list.
+ *	flag	bitwise OR of U8_VALIDATE_ENTIRE (keep going until all 'n'
+ *		bytes have been checked instead of stopping after the first
+ *		character), U8_VALIDATE_CHECK_ADDITIONAL, and
+ *		U8_VALIDATE_UCS2_RANGE (reject characters that need more
+ *		than U8_MAX_BYTES_UCS2 bytes).
+ *	errnum	receives the reason on failure; it is dereferenced without
+ *		a check, so it must not be NULL.
+ *
+ * Returns the number of bytes validated, or -1 with *errnum set to
+ *	EILSEQ	illegal first byte or malformed following byte,
+ *	ERANGE	first byte out of range, or outside UCS-2 when so restricted,
+ *	EINVAL	the last character is cut short by 'n',
+ *	EBADF	the character matches an entry of 'list'.
+ */
+int
+u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+	uchar_t *ib;
+	uchar_t *ibtail;
+	uchar_t **p;
+	uchar_t *s1;
+	uchar_t *s2;
+	uchar_t f;
+	int sz;
+	size_t i;
+	int ret_val;
+	boolean_t illegal;
+	boolean_t no_need_to_validate_entire;
+	boolean_t check_additional;
+	boolean_t validate_ucs2_range_only;
+
+	if (! u8str)
+		return (0);
+
+	ib = (uchar_t *)u8str;
+	ibtail = ib + n;
+
+	ret_val = 0;
+
+	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+	/*
+	 * A missing list simply has nothing to match against; do not walk
+	 * through a NULL pointer just because the flag was set.
+	 */
+	check_additional = ((flag & U8_VALIDATE_CHECK_ADDITIONAL) != 0 &&
+	    list != NULL) ? B_TRUE : B_FALSE;
+
+	while (ib < ibtail) {
+		/*
+		 * The first byte of a UTF-8 character tells how many
+		 * bytes will follow for the character. If the first byte
+		 * is an illegal byte value or out of range value, we just
+		 * return -1 with an appropriate error number.
+		 */
+		sz = u8_number_of_bytes[*ib];
+		if (sz == U8_ILLEGAL_CHAR) {
+			*errnum = EILSEQ;
+			return (-1);
+		}
+
+		if (sz == U8_OUT_OF_RANGE_CHAR ||
+		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+			*errnum = ERANGE;
+			return (-1);
+		}
+
+		/*
+		 * If we don't have enough bytes to check on, that's also
+		 * an error. As you can see, we give illegal byte sequence
+		 * checking higher priority than EINVAL cases.
+		 */
+		if ((ibtail - ib) < sz) {
+			*errnum = EINVAL;
+			return (-1);
+		}
+
+		/*
+		 * Consume the first byte, then check every following byte.
+		 * The byte right after the first one has a range that
+		 * depends on the first byte (u8_valid_min_2nd_byte and
+		 * u8_valid_max_2nd_byte); the remaining ones only have to
+		 * be in 0x80..0xBF.  A single byte character has no
+		 * following bytes, so the loop does not run for it.
+		 */
+		f = *ib++;
+		ret_val++;
+		for (i = 1; i < (size_t)sz; i++) {
+			if (i == 1)
+				illegal = (*ib < u8_valid_min_2nd_byte[f] ||
+				    *ib > u8_valid_max_2nd_byte[f]);
+			else
+				illegal = U8_ILLEGAL_NEXT_BYTE_COMMON(*ib);
+
+			if (illegal) {
+				*errnum = EILSEQ;
+				return (-1);
+			}
+			ib++;
+			ret_val++;
+		}
+
+		if (check_additional) {
+			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+				/* Compare the character just validated. */
+				s1 = ib - sz;
+				s2 = p[i];
+				while (s1 < ib) {
+					if (*s1 != *s2 || *s2 == '\0')
+						break;
+					s1++;
+					s2++;
+				}
+
+				/* All bytes equal and the entry ends here. */
+				if (s1 >= ib && *s2 == '\0') {
+					*errnum = EBADF;
+					return (-1);
+				}
+			}
+		}
+
+		if (no_need_to_validate_entire)
+			break;
+	}
+
+	return (ret_val);
+}
+
+/*
+ * do_case_conv() converts the single character at 's' ('sz' bytes long) to
+ * upper or lower case through the mapping tables and stores the outcome,
+ * always NUL-terminated, at 'u8s'.  When the tables have no mapping the
+ * input bytes are what ends up in 'u8s'.  The caller must make sure 'u8s'
+ * has room for the result plus the terminating NUL.
+ *
+ * These are simple case conversions mapping one character to one other
+ * character as specified in the Unicode data; the mapped character may
+ * have a different byte length than the input character.
+ *
+ * The return value is the byte length of the character left in 'u8s',
+ * not counting the terminating NUL.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+	uint16_t idx1 = 0;
+	uint16_t idx2 = 0;
+	uint16_t idx3 = 0;
+	uint16_t idx4 = 0;
+	uint16_t tbl4;
+	uint16_t base;
+	size_t from;
+	size_t to;
+	size_t out;
+
+	/*
+	 * Copy the input while right-aligning its bytes into the four
+	 * table indices; unused leading indices stay zero.  Only sizes of
+	 * 2, 3 and 4 are expected here.
+	 */
+	switch (sz) {
+	case 2:
+		idx3 = u8s[0] = s[0];
+		idx4 = u8s[1] = s[1];
+		break;
+	case 3:
+		idx2 = u8s[0] = s[0];
+		idx3 = u8s[1] = s[1];
+		idx4 = u8s[2] = s[2];
+		break;
+	case 4:
+		idx1 = u8s[0] = s[0];
+		idx2 = u8s[1] = s[1];
+		idx3 = u8s[2] = s[2];
+		idx4 = u8s[3] = s[3];
+		break;
+	default:
+		/* Should not happen; fall back to an ASCII conversion. */
+		*u8s = is_it_toupper ? U8_ASCII_TOUPPER(*s) :
+		    U8_ASCII_TOLOWER(*s);
+		u8s[1] = '\0';
+		return (1);
+	}
+	u8s[sz] = '\0';
+
+	/* Walk the tables; any undefined element means "no mapping". */
+	idx1 = u8_common_b1_tbl[uv][idx1];
+	if (idx1 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	idx2 = u8_case_common_b2_tbl[uv][idx1][idx2];
+	if (idx2 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	if (is_it_toupper) {
+		tbl4 = u8_toupper_b3_tbl[uv][idx2][idx3].tbl_id;
+		if (tbl4 == U8_TBL_ELEMENT_NOT_DEF)
+			return ((size_t)sz);
+
+		from = u8_toupper_b4_tbl[uv][tbl4][idx4];
+		to = u8_toupper_b4_tbl[uv][tbl4][idx4 + 1];
+
+		/* Either there is no match or the table is in error. */
+		if (from >= to || (to - from) > U8_MB_CUR_MAX)
+			return ((size_t)sz);
+
+		base = u8_toupper_b3_tbl[uv][idx2][idx3].base;
+
+		out = 0;
+		while (from < to) {
+			u8s[out] = u8_toupper_final_tbl[uv][base + from];
+			out++;
+			from++;
+		}
+	} else {
+		tbl4 = u8_tolower_b3_tbl[uv][idx2][idx3].tbl_id;
+		if (tbl4 == U8_TBL_ELEMENT_NOT_DEF)
+			return ((size_t)sz);
+
+		from = u8_tolower_b4_tbl[uv][tbl4][idx4];
+		to = u8_tolower_b4_tbl[uv][tbl4][idx4 + 1];
+
+		if (from >= to || (to - from) > U8_MB_CUR_MAX)
+			return ((size_t)sz);
+
+		base = u8_tolower_b3_tbl[uv][idx2][idx3].base;
+
+		out = 0;
+		while (from < to) {
+			u8s[out] = u8_tolower_final_tbl[uv][base + from];
+			out++;
+			from++;
+		}
+	}
+
+	/* Nothing copied means there is no corresponding character. */
+	if (out == 0)
+		return ((size_t)sz);
+
+	u8s[out] = '\0';
+
+	return (out);
+}
+
+/*
+ * do_case_compare() walks the strings s1 (n1 bytes) and s2 (n2 bytes) one
+ * character at a time, case-converts each character where a conversion
+ * exists, and returns a comparison result in the manner of strcmp().
+ *
+ * An illegal first byte sets *errnum to EILSEQ and is then handled as a
+ * one byte character; a character cut short by the end of its string sets
+ * *errnum to EINVAL and whatever bytes remain are compared.
+ *
+ * Since most text data is 7-bit ASCII in practice, single byte characters
+ * are converted inline instead of going through do_case_conv().
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+    size_t n2, boolean_t is_it_toupper, int *errnum)
+{
+	uchar_t folded1[U8_MB_CUR_MAX + 1];
+	uchar_t folded2[U8_MB_CUR_MAX + 1];
+	size_t pos1 = 0;
+	size_t pos2 = 0;
+	size_t k;
+	int len1;
+	int len2;
+	int diff;
+
+	for (; pos1 < n1 && pos2 < n2; pos1 += len1, pos2 += len2) {
+		/* Fetch and convert the next character of s1. */
+		len1 = u8_number_of_bytes[*s1];
+		if (len1 < 0) {
+			*errnum = EILSEQ;
+			len1 = 1;
+		}
+
+		if (len1 == 1) {
+			folded1[0] = is_it_toupper ? U8_ASCII_TOUPPER(*s1) :
+			    U8_ASCII_TOLOWER(*s1);
+			s1++;
+			folded1[1] = '\0';
+		} else if ((pos1 + len1) > n1) {
+			/* Not enough bytes left; use what is available. */
+			*errnum = EINVAL;
+			k = 0;
+			while ((pos1 + k) < n1)
+				folded1[k++] = *s1++;
+			folded1[k] = '\0';
+		} else {
+			(void) do_case_conv(uv, folded1, s1, len1,
+			    is_it_toupper);
+			s1 += len1;
+		}
+
+		/* Fetch and convert the next character of s2. */
+		len2 = u8_number_of_bytes[*s2];
+		if (len2 < 0) {
+			*errnum = EILSEQ;
+			len2 = 1;
+		}
+
+		if (len2 == 1) {
+			folded2[0] = is_it_toupper ? U8_ASCII_TOUPPER(*s2) :
+			    U8_ASCII_TOLOWER(*s2);
+			s2++;
+			folded2[1] = '\0';
+		} else if ((pos2 + len2) > n2) {
+			*errnum = EINVAL;
+			k = 0;
+			while ((pos2 + k) < n2)
+				folded2[k++] = *s2++;
+			folded2[k] = '\0';
+		} else {
+			(void) do_case_conv(uv, folded2, s2, len2,
+			    is_it_toupper);
+			s2 += len2;
+		}
+
+		/* Compare; equal characters move us on to the next pair. */
+		if (len1 == 1 && len2 == 1) {
+			if (folded1[0] != folded2[0])
+				return ((folded1[0] > folded2[0]) ? 1 : -1);
+		} else {
+			diff = strcmp((const char *)folded1,
+			    (const char *)folded2);
+			if (diff != 0)
+				return (diff);
+		}
+	}
+
+	/*
+	 * At least one string is used up.  If s1 still has bytes it is the
+	 * greater one; otherwise the strings are equal when s2 is used up
+	 * as well, and s1 is the lesser one when it is not.
+	 */
+	if (pos1 < n1)
+		return (1);
+
+	return ((pos2 >= n2) ? 0 : -1);
+}
+
+/*
+ * combining_class() looks up the Unicode combining class of the character
+ * made of the 'sz' bytes at 's'.  A return value of 0 means Starter; that
+ * is also the answer for single byte characters, for sizes above four
+ * bytes, and for anything the tables do not define.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+	uint16_t idx1 = 0;
+	uint16_t idx2 = 0;
+	uint16_t idx3 = 0;
+	uint16_t idx4 = 0;
+
+	/* Right-align the bytes into the four table indices. */
+	switch (sz) {
+	case 0:
+		/* No bytes to load; the lookup runs with all zeros. */
+		break;
+	case 2:
+		idx3 = s[0];
+		idx4 = s[1];
+		break;
+	case 3:
+		idx2 = s[0];
+		idx3 = s[1];
+		idx4 = s[2];
+		break;
+	case 4:
+		idx1 = s[0];
+		idx2 = s[1];
+		idx3 = s[2];
+		idx4 = s[3];
+		break;
+	default:
+		/* One byte, or more than four: treated as a Starter. */
+		return (0);
+	}
+
+	idx1 = u8_common_b1_tbl[uv][idx1];
+	if (idx1 == U8_TBL_ELEMENT_NOT_DEF)
+		return (0);
+
+	idx2 = u8_combining_class_b2_tbl[uv][idx1][idx2];
+	if (idx2 == U8_TBL_ELEMENT_NOT_DEF)
+		return (0);
+
+	idx3 = u8_combining_class_b3_tbl[uv][idx2][idx3];
+	if (idx3 == U8_TBL_ELEMENT_NOT_DEF)
+		return (0);
+
+	return (u8_combining_class_b4_tbl[uv][idx3][idx4]);
+}
+
+/*
+ * The do_decomp() function looks for a decomposition mapping of the
+ * character at 's' and stores it at 'u8s'. If there is no match, the input
+ * bytes are copied to 'u8s' instead. The function also checks if the
+ * character is a Hangul syllable and, if so, decomposes it arithmetically
+ * into its Jamos and returns.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * Arguments:
+ *	uv	first index into every lookup table used here (presumably
+ *		the Unicode version selector -- confirm against callers).
+ *	u8s	output buffer; always NUL-terminated on return. A decomposed
+ *		Hangul syllable needs up to 9 bytes plus the NUL, and a
+ *		table mapping is copied without any bound check here, so the
+ *		caller has to supply a large enough buffer.
+ *	s	bytes of the character to decompose.
+ *	sz	number of bytes at 's'; 2, 3 and 4 are handled, anything else
+ *		takes the one byte fallback.
+ *	canonical_decomposition
+ *		when true only canonical mappings are used, otherwise the
+ *		compatibility mapping is used where one exists.
+ *	state	in/out; read when a Jamo vowel or trailing consonant is seen
+ *		(to tell L+V and LV+T apart from a stray V or T) and always
+ *		written before returning.
+ *
+ * The function returns the number of bytes stored at 'u8s', not counting
+ * the terminating null byte. The state tells the caller whether a Hangul
+ * character was seen or decomposed.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+ boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+ size_t i;
+ uint32_t u1;
+
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ u8s[2] = '\0';
+ } else if (sz == 3) {
+ /* Convert it to a Unicode scalar value. */
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+ /*
+ * If this is a Hangul syllable, we decompose it into
+ * a leading consonant, a vowel, and an optional trailing
+ * consonant and then return.
+ */
+ if (U8_HANGUL_SYLLABLE(u1)) {
+ u1 -= U8_HANGUL_SYL_FIRST;
+
+ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+ / U8_HANGUL_T_COUNT;
+ b3 = u1 % U8_HANGUL_T_COUNT; /* 0: no trailing consonant */
+
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+ if (b3) {
+ b3 += U8_HANGUL_JAMO_T_FIRST; /* T_FIRST is already one below U+11A8 */
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+ u8s[9] = '\0';
+ *state = U8_STATE_HANGUL_LVT;
+ return (9);
+ }
+
+ u8s[6] = '\0';
+ *state = U8_STATE_HANGUL_LV;
+ return (6);
+ }
+
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ u8s[3] = '\0';
+
+ /*
+ * If this is a Hangul Jamo, we know there is nothing
+ * further that we can decompose. The incoming state decides
+ * whether a vowel or trailing consonant continues a syllable
+ * (L -> LV, LV -> LVT) or stands on its own (V, T).
+ */
+ if (U8_HANGUL_JAMO_L(u1)) {
+ *state = U8_STATE_HANGUL_L;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_V(u1)) {
+ if (*state == U8_STATE_HANGUL_L)
+ *state = U8_STATE_HANGUL_LV;
+ else
+ *state = U8_STATE_HANGUL_V;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_T(u1)) {
+ if (*state == U8_STATE_HANGUL_LV)
+ *state = U8_STATE_HANGUL_LVT;
+ else
+ *state = U8_STATE_HANGUL_T;
+ return (3);
+ }
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ u8s[4] = '\0';
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ u8s[0] = s[0];
+ u8s[1] = '\0';
+ *state = U8_STATE_START;
+ return (1);
+ }
+
+ /*
+ * At this point, this routine does not know what it would get.
+ * The caller should sort it out if the state isn't a Hangul one.
+ */
+ *state = U8_STATE_START;
+
+ /*
+ * Try to find matching decomposition mapping byte sequence. From
+ * here on every early return leaves the copied input bytes in u8s.
+ */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_decomp_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ /*
+ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+ * which is 0x8000, this means we couldn't fit the mappings into
+ * the cardinality of a unsigned byte.
+ */
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ /* This also means there wasn't any matching decomposition. */
+ if (start_id >= end_id)
+ return ((size_t)sz);
+
+ /*
+ * The final table for decomposition mappings has three types of
+ * byte sequences depending on whether a mapping is for compatibility
+ * decomposition, canonical decomposition, or both like the following:
+ *
+ * (1) Compatibility decomposition mappings:
+ *
+ * +---+---+-...-+---+
+ * | B0| B1| ... | Bm|
+ * +---+---+-...-+---+
+ *
+ * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
+ *
+ * (2) Canonical decomposition mappings:
+ *
+ * +---+---+---+-...-+---+
+ * | T | b0| b1| ... | bn|
+ * +---+---+---+-...-+---+
+ *
+ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+ *
+ * (3) Both mappings:
+ *
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ *
+ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+ * compatibility mapping bytes.
+ *
+ * Note that compatibility decomposition means doing recursive
+ * decompositions using both compatibility decomposition mappings and
+ * canonical decomposition mappings. On the other hand, canonical
+ * decomposition means doing recursive decompositions using only
+ * canonical decomposition mappings. Since the table we have has gone
+ * through the recursions already, we do not need to do so during
+ * runtime, i.e., the table has been completely flattened out
+ * already.
+ */
+
+ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+ /* Get the type, T, of the byte sequence. */
+ b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+ /*
+ * If necessary, adjust start_id, end_id, or both. Note that if
+ * this is compatibility decomposition mapping, there is no
+ * adjustment.
+ */
+ if (canonical_decomposition) {
+ /* Is the mapping only for compatibility decomposition? */
+ if (b1 < U8_DECOMP_BOTH)
+ return ((size_t)sz);
+
+ start_id++; /* step over the type byte T */
+
+ if (b1 == U8_DECOMP_BOTH) {
+ end_id = start_id +
+ u8_decomp_final_tbl[uv][b3_base + start_id];
+ start_id++; /* step over the displacement byte D */
+ }
+ } else {
+ /*
+ * Unless this is a compatibility decomposition mapping,
+ * we adjust the start_id.
+ */
+ if (b1 == U8_DECOMP_BOTH) {
+ start_id++; /* now at D; adding D lands on B0 */
+ start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+ } else if (b1 == U8_DECOMP_CANONICAL) {
+ start_id++;
+ }
+ }
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * find_composition_start() takes the 'sz' bytes of the character at 's'
+ * and returns the address of its entry in the composition final table, or
+ * NULL when the character has no composition mappings (or 'sz' is not in
+ * the range of 1 to 4).  The layout of the entry is explained in
+ * do_composition().
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+	uint16_t idx1 = 0;
+	uint16_t idx2 = 0;
+	uint16_t idx3 = 0;
+	uint16_t idx4 = 0;
+	uint16_t tbl4;
+	uint16_t base;
+	size_t from;
+	size_t to;
+
+	/* Right-align the bytes into the four table indices. */
+	switch (sz) {
+	case 1:
+		idx4 = s[0];
+		break;
+	case 2:
+		idx3 = s[0];
+		idx4 = s[1];
+		break;
+	case 3:
+		idx2 = s[0];
+		idx3 = s[1];
+		idx4 = s[2];
+		break;
+	case 4:
+		idx1 = s[0];
+		idx2 = s[1];
+		idx3 = s[2];
+		idx4 = s[3];
+		break;
+	default:
+		/* Should not happen if the function was called properly. */
+		return (NULL);
+	}
+
+	idx1 = u8_composition_b1_tbl[uv][idx1];
+	if (idx1 == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	idx2 = u8_composition_b2_tbl[uv][idx1][idx2];
+	if (idx2 == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	tbl4 = u8_composition_b3_tbl[uv][idx2][idx3].tbl_id;
+	if (tbl4 == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	/* Ids at or above the indicator select the 16-bit offset table. */
+	if (tbl4 >= U8_16BIT_TABLE_INDICATOR) {
+		tbl4 -= U8_16BIT_TABLE_INDICATOR;
+		from = u8_composition_b4_16bit_tbl[uv][tbl4][idx4];
+		to = u8_composition_b4_16bit_tbl[uv][tbl4][idx4 + 1];
+	} else {
+		from = u8_composition_b4_tbl[uv][tbl4][idx4];
+		to = u8_composition_b4_tbl[uv][tbl4][idx4 + 1];
+	}
+
+	/* An empty range means there is no mapping for this character. */
+	if (from >= to)
+		return (NULL);
+
+	base = u8_composition_b3_tbl[uv][idx2][idx3].base;
+
+	return ((uchar_t *)&(u8_composition_final_tbl[uv][base + from]));
+}
+
+/*
+ * blocked() tells whether the character at index 'last' of the sequence is
+ * blocked: it is when any character at index 1 through last - 1 is a
+ * Starter or has a combining class greater than or equal to the class of
+ * the character at 'last'.  Index 0 is not examined.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+	const uchar_t mine = comb_class[last];
+	size_t idx = 1;
+
+	while (idx < last) {
+		uchar_t theirs = comb_class[idx++];
+
+		if (theirs == U8_COMBINING_CLASS_STARTER || theirs >= mine)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * The do_composition() reads the character string pointed by 's' and
+ * do necessary canonical composition and then copy over the result back to
+ * the 's'.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
+/*
+ * do_composition() - canonically compose an already collected sequence.
+ *
+ * uv		Unicode version selector handed to find_composition_start().
+ * s		Bytes of the sequence; overwritten with the composed result
+ *		and NUL-terminated before returning.
+ * comb_class	Per-character combining class; U8_COMBINING_CLASS_STARTER
+ *		identifies a Starter.
+ * start	Per-character byte offset into s.
+ * disp		Per-character byte length.
+ * last		Index of the last character described by the three arrays.
+ * os, oslast	Caller's read position in the source text and the end of
+ *		that text. When the sequence ends in a Starter, following
+ *		source characters may be composed onto it and *os is moved
+ *		past the ones that were consumed.
+ *
+ * Returns the number of bytes stored in s, not counting the NUL.
+ */
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+ /* t[] is the scratch output; it is copied back over s at the end. */
+ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc[U8_MB_CUR_MAX];
+ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+ size_t saved_marks_count;
+ uchar_t *p;
+ uchar_t *saved_p;
+ uchar_t *q;
+ size_t i;
+ size_t saved_i;
+ size_t j;
+ size_t k;
+ size_t l;
+ size_t C;
+ size_t saved_l;
+ size_t size;
+ uint32_t u1;
+ uint32_t u2;
+ boolean_t match_not_found = B_TRUE;
+
+ /*
+ * This should never happen unless the callers are doing some strange
+ * and unexpected things.
+ *
+ * The "last" is the index pointing to the last character not last + 1.
+ */
+ if (last >= U8_MAX_CHARS_A_SEQ)
+ last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+ /* i walks the input characters; l is the write offset into t[]. */
+ for (i = l = 0; i <= last; i++) {
+ /*
+ * The last or any non-Starters at the beginning, we don't
+ * have any chance to do composition and so we just copy them
+ * to the temporary buffer.
+ *
+ * SAVE_THE_CHAR is also the landing point for the gotos below
+ * that give up on composing the character at index i.
+ */
+ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+ p = s + start[i];
+ size = disp[i];
+ for (k = 0; k < size; k++)
+ t[l++] = *p++;
+ continue;
+ }
+
+ /*
+ * If this could be a start of Hangul Jamos, then, we try to
+ * conjoin them.
+ *
+ * u2 is read from the three bytes that directly follow this
+ * character in s, i.e., without consulting start[i + 1].
+ */
+ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+ s[start[i] + 1], s[start[i] + 2]);
+ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+ s[start[i] + 4], s[start[i] + 5]);
+
+ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+ u1 -= U8_HANGUL_JAMO_L_FIRST;
+ u2 -= U8_HANGUL_JAMO_V_FIRST;
+ u1 = U8_HANGUL_SYL_FIRST +
+ (u1 * U8_HANGUL_V_COUNT + u2) *
+ U8_HANGUL_T_COUNT;
+
+ /* Step over L and V; i now names a possible T. */
+ i += 2;
+ if (i <= last) {
+ U8_PUT_3BYTES_INTO_UTF32(u2,
+ s[start[i]], s[start[i] + 1],
+ s[start[i] + 2]);
+
+ if (U8_HANGUL_JAMO_T(u2)) {
+ u1 += u2 -
+ U8_HANGUL_JAMO_T_FIRST;
+ i++;
+ }
+ }
+
+ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+ /* Undo one step; the for loop's i++ re-adds it. */
+ i--;
+ l += 3;
+ continue;
+ }
+ }
+
+ /*
+ * Let's then find out if this Starter has composition
+ * mapping.
+ */
+ p = find_composition_start(uv, s + start[i], disp[i]);
+ if (p == NULL)
+ goto SAVE_THE_CHAR;
+
+ /*
+ * We have a Starter with composition mapping and the next
+ * character is a non-Starter. Let's try to find out if
+ * we can do composition.
+ *
+ * saved_i and saved_l remember the Starter and where its
+ * output begins in t[]; saved_p is the table entry to rescan
+ * for each further mark.
+ */
+
+ saved_p = p;
+ saved_i = i;
+ saved_l = l;
+ saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+ q = s + start[++i];
+ size = disp[i];
+
+ /*
+ * The next for() loop compares the non-Starter pointed by
+ * 'q' with the possible (joinable) characters pointed by 'p'.
+ *
+ * The composition final table entry pointed by the 'p'
+ * looks like the following:
+ *
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ *
+ * where C is the count byte indicating the number of
+ * mapping pairs where each pair would be look like
+ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+ * character of a canonical decomposition and the B0-Bm are
+ * the bytes of a matching composite character. The F is
+ * a filler byte after each character as the separator.
+ */
+
+ match_not_found = B_TRUE;
+
+ for (C = *p++; C > 0; C--) {
+ for (k = 0; k < size; p++, k++)
+ if (*p != q[k])
+ break;
+
+ /* Have we found it? */
+ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ /* Overwrite from saved_l with the composite. */
+ l = saved_l;
+
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ t[l++] = *p;
+
+ break;
+ }
+
+ /* We didn't find; skip to the next pair. */
+ if (*p != U8_TBL_ELEMENT_FILLER)
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ p++;
+ }
+
+ /*
+ * If there was no match, we will need to save the combining
+ * mark for later appending. After that, if the next one
+ * is a non-Starter and not blocked, then, we try once
+ * again to do composition with the next non-Starter.
+ *
+ * If there was no match and this was a Starter, then,
+ * this is a new start.
+ *
+ * If there was a match and a composition done and we have
+ * more to check on, then, we retrieve a new composition final
+ * table entry for the composite and then try to do the
+ * composition again.
+ *
+ * NOTE(review): the "new start" path copies the character at
+ * i - 1 verbatim; that is the original Starter only when no
+ * mark was consumed in between -- confirm that blocked() makes
+ * the other cases unreachable.
+ */
+
+ if (match_not_found) {
+ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+ i--;
+ goto SAVE_THE_CHAR;
+ }
+
+ saved_marks[saved_marks_count++] = i;
+ }
+
+ /* saved_l == l means nothing has been composed so far. */
+ if (saved_l == l) {
+ while (i < last) {
+ if (blocked(comb_class, i + 1))
+ saved_marks[saved_marks_count++] = ++i;
+ else
+ break;
+ }
+ if (i < last) {
+ p = saved_p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ } else if (i < last) {
+ p = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (p != NULL) {
+ saved_p = p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ }
+
+ /*
+ * There is no more composition possible.
+ *
+ * If there was no composition what so ever then we copy
+ * over the original Starter and then append any non-Starters
+ * remaining at the target string sequentially after that.
+ */
+
+ if (saved_l == l) {
+ p = s + start[saved_i];
+ size = disp[saved_i];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+
+ for (k = 0; k < saved_marks_count; k++) {
+ p = s + start[saved_marks[k]];
+ size = disp[saved_marks[k]];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+ }
+
+ /*
+ * If the last character is a Starter and if we have a character
+ * (possibly another Starter) that can be turned into a composite,
+ * we do so and we do so until there is no more of composition
+ * possible.
+ *
+ * The final disp[last] bytes of t[] are taken to be that Starter.
+ */
+ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+ p = *os;
+ saved_l = l - disp[last];
+
+ while (p < oslast) {
+ size = u8_number_of_bytes[*p];
+ /* Stop on ASCII, illegal or truncated characters. */
+ if (size <= 1 || (p + size) > oslast)
+ break;
+
+ /* saved_p lets us un-read this character. */
+ saved_p = p;
+
+ for (i = 0; i < size; i++)
+ tc[i] = *p++;
+
+ q = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (q == NULL) {
+ p = saved_p;
+ break;
+ }
+
+ match_not_found = B_TRUE;
+
+ for (C = *q++; C > 0; C--) {
+ for (k = 0; k < size; q++, k++)
+ if (*q != tc[k])
+ break;
+
+ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++q != U8_TBL_ELEMENT_FILLER) {
+ /*
+ * This is practically
+ * impossible but we don't
+ * want to take any chances.
+ */
+ if (l >=
+ U8_STREAM_SAFE_TEXT_MAX) {
+ p = saved_p;
+ goto SAFE_RETURN;
+ }
+ t[l++] = *q;
+ }
+
+ break;
+ }
+
+ if (*q != U8_TBL_ELEMENT_FILLER)
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ q++;
+ }
+
+ if (match_not_found) {
+ p = saved_p;
+ break;
+ }
+ }
+SAFE_RETURN:
+ /* Tell the caller how far into the source we consumed. */
+ *os = p;
+ }
+
+ /*
+ * Now we copy over the temporary string to the target string.
+ * Since composition always reduces the number of characters or
+ * the number of characters stay, we don't need to worry about
+ * the buffer overflow here.
+ */
+ for (i = 0; i < l; i++)
+ s[i] = t[i];
+ s[l] = '\0';
+
+ return (l);
+}
+
+/*
+ * The collect_a_seq() function checks on the given string s, collects
+ * a sequence of characters at u8s, and returns the sequence. While it
+ * collects a sequence, it also applies case conversion, canonical or
+ * compatibility decomposition, canonical composition, or some or all of
+ * them and in that order.
+ *
+ * The collected sequence cannot be bigger than 32 characters since if
+ * it is having more than 31 characters, the sequence will be terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * a Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte and the return value is the byte length of the sequence,
+ * which can be 0. The return value does not include the terminating
+ * null byte.
+ *
+ * *source is advanced past the bytes that were consumed; slast is the
+ * end of the input. *errnum is set to EILSEQ when the first byte is
+ * an illegal lead byte and to EINVAL when the first character is cut off
+ * by slast; it is not written otherwise. *state is read and updated
+ * while collecting when a decomposition is requested.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+ boolean_t is_it_toupper,
+ boolean_t is_it_tolower,
+ boolean_t canonical_decomposition,
+ boolean_t compatibility_decomposition,
+ boolean_t canonical_composition,
+ int *errnum, u8_normalization_states_t *state)
+{
+ uchar_t *s;
+ int sz;
+ int saved_sz;
+ size_t i;
+ size_t j;
+ size_t k;
+ size_t l;
+ /* Per-character bookkeeping later handed to do_composition(). */
+ uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+ uchar_t disp[U8_MAX_CHARS_A_SEQ];
+ uchar_t start[U8_MAX_CHARS_A_SEQ];
+ uchar_t u8t[U8_MB_CUR_MAX];
+ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc;
+ size_t last;
+ size_t saved_last;
+ uint32_t u1;
+
+ /*
+ * Save the source string pointer which we will return a changed
+ * pointer if we do processing.
+ */
+ s = *source;
+
+ /*
+ * The following is a fallback for just in case callers are not
+ * checking the string boundaries before the calling.
+ */
+ if (s >= slast) {
+ u8s[0] = '\0';
+
+ return (0);
+ }
+
+ /*
+ * As the first thing, let's collect a character and do case
+ * conversion if necessary.
+ */
+
+ sz = u8_number_of_bytes[*s];
+
+ /* Illegal lead byte: return it alone and report EILSEQ. */
+ if (sz < 0) {
+ *errnum = EILSEQ;
+
+ u8s[0] = *s++;
+ u8s[1] = '\0';
+
+ *source = s;
+
+ return (1);
+ }
+
+ if (sz == 1) {
+ if (is_it_toupper)
+ u8s[0] = U8_ASCII_TOUPPER(*s);
+ else if (is_it_tolower)
+ u8s[0] = U8_ASCII_TOLOWER(*s);
+ else
+ u8s[0] = *s;
+ s++;
+ u8s[1] = '\0';
+ } else if ((s + sz) > slast) {
+ /* Truncated character: return what is left, report EINVAL. */
+ *errnum = EINVAL;
+
+ for (i = 0; s < slast; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+
+ *source = s;
+
+ return (i);
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+ s += sz;
+ sz = i;
+ } else {
+ for (i = 0; i < sz; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+ }
+ }
+
+ /*
+ * And then canonical/compatibility decomposition followed by
+ * an optional canonical composition. Please be noted that
+ * canonical composition is done only when a decomposition is
+ * done.
+ */
+ if (canonical_decomposition || compatibility_decomposition) {
+ if (sz == 1) {
+ *state = U8_STATE_START;
+
+ saved_sz = 1;
+
+ comb_class[0] = 0;
+ start[0] = 0;
+ disp[0] = 1;
+
+ /* A count for now; turned into an index by last--. */
+ last = 1;
+ } else {
+ saved_sz = do_decomp(uv, u8s, u8s, sz,
+ canonical_decomposition, state);
+
+ last = 0;
+
+ /* Index every character the decomposition produced. */
+ for (i = 0; i < saved_sz; ) {
+ sz = u8_number_of_bytes[u8s[i]];
+
+ comb_class[last] = combining_class(uv,
+ u8s + i, sz);
+ start[last] = i;
+ disp[last] = sz;
+
+ last++;
+ i += sz;
+ }
+
+ /*
+ * Decomposition yields various Hangul related
+ * states but not on combining marks. We need to
+ * find out at here by checking on the last
+ * character.
+ */
+ if (*state == U8_STATE_START) {
+ if (comb_class[last - 1])
+ *state = U8_STATE_COMBINING_MARK;
+ }
+ }
+
+ /* Character count of the first (decomposed) character. */
+ saved_last = last;
+
+ while (s < slast) {
+ sz = u8_number_of_bytes[*s];
+
+ /*
+ * If this is an illegal character, an incomplete
+ * character, or an 7-bit ASCII Starter character,
+ * then we have collected a sequence; break and let
+ * the next call deal with the two cases.
+ *
+ * Note that this is okay only if you are using this
+ * function with a fixed length string, not on
+ * a buffer with multiple calls of one chunk at a time.
+ */
+ if (sz <= 1) {
+ break;
+ } else if ((s + sz) > slast) {
+ break;
+ } else {
+ /*
+ * If the previous character was a Hangul Jamo
+ * and this character is a Hangul Jamo that
+ * can be conjoined, we collect the Jamo.
+ */
+ if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1,
+ *s, *(s + 1), *(s + 2));
+
+ if (U8_HANGUL_COMPOSABLE_L_V(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LV;
+ goto COLLECT_A_HANGUL;
+ }
+
+ if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LVT;
+ goto COLLECT_A_HANGUL;
+ }
+ }
+
+ /*
+ * Regardless of whatever it was, if this is
+ * a Starter, we don't collect the character
+ * since that's a new start and we will deal
+ * with it at the next time.
+ */
+ i = combining_class(uv, s, sz);
+ if (i == U8_COMBINING_CLASS_STARTER)
+ break;
+
+ /*
+ * We know the current character is a combining
+ * mark. If the previous character wasn't
+ * a Starter (not Hangul) or a combining mark,
+ * then, we don't collect this combining mark.
+ */
+ if (*state != U8_STATE_START &&
+ *state != U8_STATE_COMBINING_MARK)
+ break;
+
+ *state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+ /*
+ * If we collected a Starter and combining
+ * marks up to 30, i.e., total 31 characters,
+ * then, we terminate this degenerately long
+ * combining sequence with a U+034F COMBINING
+ * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
+ * UTF-8 and turn this into a Stream-Safe
+ * Text. This will be extremely rare but
+ * possible.
+ *
+ * The following will also guarantee that
+ * we are not writing more than 32 characters
+ * plus a NULL at u8s[].
+ */
+ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+ *state = U8_STATE_START;
+ comb_class[last] = 0;
+ start[last] = saved_sz;
+ disp[last] = 2;
+ last++;
+
+ u8s[saved_sz++] = 0xCD;
+ u8s[saved_sz++] = 0x8F;
+
+ break;
+ }
+
+ /*
+ * Some combining marks also do decompose into
+ * another combining mark or marks.
+ *
+ * k remembers last so that a decomposition
+ * that would overflow the limit can be rolled
+ * back; l remembers the source byte length.
+ */
+ if (*state == U8_STATE_COMBINING_MARK) {
+ k = last;
+ l = sz;
+ i = do_decomp(uv, uts, s, sz,
+ canonical_decomposition, state);
+ for (j = 0; j < i; ) {
+ sz = u8_number_of_bytes[uts[j]];
+
+ comb_class[last] =
+ combining_class(uv,
+ uts + j, sz);
+ start[last] = saved_sz + j;
+ disp[last] = sz;
+
+ last++;
+ if (last >=
+ U8_UPPER_LIMIT_IN_A_SEQ) {
+ last = k;
+ goto TURN_STREAM_SAFE;
+ }
+ j += sz;
+ }
+
+ *state = U8_STATE_COMBINING_MARK;
+ sz = i;
+ s += l;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = uts[i];
+ } else {
+ comb_class[last] = i;
+ start[last] = saved_sz;
+ disp[last] = sz;
+ last++;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = *s++;
+ }
+
+ /*
+ * If this is U+0345 COMBINING GREEK
+ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
+ * iota subscript, and need to be converted to
+ * uppercase letter, convert it to U+0399 GREEK
+ * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
+ * i.e., convert to capital adscript form as
+ * specified in the Unicode standard.
+ *
+ * This is the only special case of (ambiguous)
+ * case conversion at combining marks and
+ * probably the standard will never have
+ * anything similar like this in future.
+ */
+ if (is_it_toupper && sz >= 2 &&
+ u8s[saved_sz - 2] == 0xCD &&
+ u8s[saved_sz - 1] == 0x85) {
+ u8s[saved_sz - 2] = 0xCE;
+ u8s[saved_sz - 1] = 0x99;
+ }
+ }
+ }
+
+ /*
+ * Let's try to ensure a canonical ordering for the collected
+ * combining marks. We do this only if we have collected
+ * at least one more non-Starter. (The decomposition mapping
+ * data tables have fully (and recursively) expanded and
+ * canonically ordered decompositions.)
+ *
+ * The U8_SWAP_COMB_MARKS() convenience macro has some
+ * assumptions and we are meeting the assumptions.
+ *
+ * From here on, last is the index of the final character
+ * rather than the character count.
+ */
+ last--;
+ if (last >= saved_last) {
+ for (i = 0; i < last; i++)
+ for (j = last; j > i; j--)
+ if (comb_class[j] &&
+ comb_class[j - 1] > comb_class[j]) {
+ U8_SWAP_COMB_MARKS(j - 1, j);
+ }
+ }
+
+ *source = s;
+
+ if (! canonical_composition) {
+ u8s[saved_sz] = '\0';
+ return (saved_sz);
+ }
+
+ /*
+ * Now do the canonical composition. Note that we do this
+ * only after a canonical or compatibility decomposition to
+ * finish up NFC or NFKC.
+ *
+ * do_composition() may move s further into the source.
+ */
+ sz = do_composition(uv, u8s, comb_class, start, disp, last,
+ &s, slast);
+ }
+
+ *source = s;
+
+ /* Without decomposition this is the first character's length. */
+ return ((size_t)sz);
+}
+
+/*
+ * Pull the next comparison unit for do_norm_compare() out of *src and
+ * store it, NUL-terminated, in seq[].
+ *
+ * A 7-bit ASCII character that is either the last byte of the input or is
+ * followed by another 7-bit ASCII character is its own unit and only gets
+ * the requested ASCII case conversion. Everything else is delegated to
+ * collect_a_seq(), starting from U8_STATE_START.
+ *
+ * *src is advanced past the consumed bytes; the byte length of the unit
+ * is returned.
+ */
+static size_t
+norm_compare_next_seq(size_t uv, uchar_t *seq, uchar_t **src,
+    uchar_t *srclast, int flag, int *errnum)
+{
+	boolean_t to_upper = flag & U8_TEXTPREP_TOUPPER;
+	boolean_t to_lower = flag & U8_TEXTPREP_TOLOWER;
+	boolean_t canon_decomp = flag & U8_CANON_DECOMP;
+	boolean_t compat_decomp = flag & U8_COMPAT_DECOMP;
+	boolean_t canon_comp = flag & U8_CANON_COMP;
+	u8_normalization_states_t state;
+	uchar_t *cur = *src;
+
+	if (U8_ISASCII(*cur) &&
+	    ((cur + 1) >= srclast || U8_ISASCII(*(cur + 1)))) {
+		if (to_upper)
+			seq[0] = U8_ASCII_TOUPPER(*cur);
+		else if (to_lower)
+			seq[0] = U8_ASCII_TOLOWER(*cur);
+		else
+			seq[0] = *cur;
+		seq[1] = '\0';
+		*src = cur + 1;
+
+		return (1);
+	}
+
+	state = U8_STATE_START;
+
+	return (collect_a_seq(uv, seq, src, srclast, to_upper, to_lower,
+	    canon_decomp, compat_decomp, canon_comp, errnum, &state));
+}
+
+/*
+ * The do_norm_compare() function does string comparison based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It works by taking one character sequence at a time from each string
+ * and comparing the two collected sequences.
+ *
+ * The meanings of the return values are the same as the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+    int flag, int *errnum)
+{
+	uchar_t seq1[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t seq2[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t *end1 = s1 + n1;
+	uchar_t *end2 = s2 + n2;
+	size_t len1;
+	size_t len2;
+	int diff;
+
+	while (s1 < end1 && s2 < end2) {
+		len1 = norm_compare_next_seq(uv, seq1, &s1, end1, flag,
+		    errnum);
+		len2 = norm_compare_next_seq(uv, seq2, &s2, end2, flag,
+		    errnum);
+
+		/*
+		 * Single bytes are compared directly; anything longer goes
+		 * through strcmp(). Equal sequences mean we keep going.
+		 */
+		if (len1 == 1 && len2 == 1) {
+			if (seq1[0] != seq2[0])
+				return (seq1[0] > seq2[0] ? 1 : -1);
+			continue;
+		}
+
+		diff = strcmp((const char *)seq1, (const char *)seq2);
+		if (diff != 0)
+			return (diff);
+	}
+
+	/*
+	 * At least one string is exhausted. Whichever string still has
+	 * bytes left is the greater one; if neither has, they are equal.
+	 */
+	if (s1 < end1)
+		return (1);
+
+	return (s2 < end2 ? -1 : 0);
+}
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings much like strcmp()
+ * does. In addition, equivalence based on Unicode Normalization and on
+ * Unicode simple case conversion mappings can be requested through flag.
+ *
+ * n limits the comparison to the first n bytes of each string; 0 means
+ * no limit. *errnum is cleared on entry and then set to ERANGE when uv is
+ * newer than U8_UNICODE_LATEST (the latest version is used instead) or to
+ * EBADF when flag holds a conflicting combination (a plain case-sensitive
+ * comparison is done instead).
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+    int *errnum)
+{
+	int casebits;
+	int normbits;
+	size_t len1;
+	size_t len2;
+
+	*errnum = 0;
+
+	/* Validate the Unicode version, case and normalization requests. */
+
+	if (uv > U8_UNICODE_LATEST) {
+		*errnum = ERANGE;
+		uv = U8_UNICODE_LATEST;
+	}
+
+	if (flag != 0) {
+		casebits = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+		    U8_STRCMP_CI_LOWER);
+		if (casebits == 0) {
+			flag |= U8_STRCMP_CS;
+		} else if (casebits != U8_STRCMP_CS &&
+		    casebits != U8_STRCMP_CI_UPPER &&
+		    casebits != U8_STRCMP_CI_LOWER) {
+			*errnum = EBADF;
+			flag = U8_STRCMP_CS;
+		}
+
+		normbits = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP |
+		    U8_CANON_COMP);
+		if (normbits != 0 && normbits != U8_STRCMP_NFD &&
+		    normbits != U8_STRCMP_NFC &&
+		    normbits != U8_STRCMP_NFKD &&
+		    normbits != U8_STRCMP_NFKC) {
+			*errnum = EBADF;
+			flag = U8_STRCMP_CS;
+		}
+	} else {
+		flag = U8_STRCMP_CS;
+	}
+
+	/* Plain case-sensitive comparison needs nothing Unicode-aware. */
+	if (flag == U8_STRCMP_CS) {
+		if (n == 0)
+			return (strcmp(s1, s2));
+		return (strncmp(s1, s2, n));
+	}
+
+	len1 = strlen(s1);
+	if (n != 0 && n < len1)
+		len1 = n;
+	len2 = strlen(s2);
+	if (n != 0 && n < len2)
+		len2 = n;
+
+	/*
+	 * Case conversion without normalization is much cheaper, so it
+	 * has its own comparison routine.
+	 */
+	if (flag == U8_STRCMP_CI_UPPER || flag == U8_STRCMP_CI_LOWER) {
+		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+		    len1, len2,
+		    (flag == U8_STRCMP_CI_UPPER) ? B_TRUE : B_FALSE,
+		    errnum));
+	}
+
+	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, len1, len2,
+	    flag, errnum));
+}
+
+/*
+ * u8_textprep_str() - prepare (case convert and/or normalize) UTF-8 text.
+ *
+ * inarray/inlen	Input bytes and their count. On return from the
+ *			conversion loops *inlen holds the number of input
+ *			bytes that were not consumed.
+ * outarray/outlen	Output buffer and its capacity. On return from the
+ *			conversion loops *outlen holds the space left over.
+ * flag			U8_TEXTPREP_* and normalization bits.
+ * unicode_version	Must not be greater than U8_UNICODE_LATEST.
+ * errnum		Receives ERANGE (bad version), EBADF (conflicting
+ *			flags), E2BIG (no or too small output buffer),
+ *			EILSEQ (illegal byte) or EINVAL (truncated
+ *			character).
+ *
+ * Returns (size_t)-1 on error. Otherwise returns ret_val, which the
+ * non-normalizing path increments once per illegal or incomplete
+ * character copied through under U8_TEXTPREP_IGNORE_INVALID; the
+ * normalizing path never increments it.
+ *
+ * The argument checks at the top return without updating *inlen or
+ * *outlen.
+ */
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+ int flag, size_t unicode_version, int *errnum)
+{
+ int f;
+ int sz;
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t *ob;
+ uchar_t *obtail;
+ boolean_t do_not_ignore_null;
+ boolean_t do_not_ignore_invalid;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ size_t ret_val;
+ size_t i;
+ size_t j;
+ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+ u8_normalization_states_t state;
+
+ if (unicode_version > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ return ((size_t)-1);
+ }
+
+ /* Upper and lower case conversion are mutually exclusive. */
+ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ /* f keeps the normalization bits for the f == 0 test below. */
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ if (inarray == NULL || *inlen == 0)
+ return (0);
+
+ if (outarray == NULL) {
+ *errnum = E2BIG;
+ return ((size_t)-1);
+ }
+
+ ib = (uchar_t *)inarray;
+ ob = (uchar_t *)outarray;
+ ibtail = ib + *inlen;
+ obtail = ob + *outlen;
+
+ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+ ret_val = 0;
+
+ /*
+ * If we don't have a normalization flag set, we do the simple case
+ * conversion based text preparation separately below. Text
+ * preparation involving Normalization is done in the else block,
+ * again, separately since it will take much more time and
+ * resource than doing simple case conversions.
+ */
+ if (f == 0) {
+ while (ib < ibtail) {
+ /* A NUL ends the input unless told to ignore it. */
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ sz = u8_number_of_bytes[*ib];
+
+ if (sz < 0) {
+ if (do_not_ignore_invalid) {
+ *errnum = EILSEQ;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /* Pass the illegal byte through and count it. */
+ sz = 1;
+ ret_val++;
+ }
+
+ if (sz == 1) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else if ((ib + sz) > ibtail) {
+ if (do_not_ignore_invalid) {
+ *errnum = EINVAL;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < (ibtail - ib)) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /*
+ * We treat the remaining incomplete character
+ * bytes as a character.
+ */
+ ret_val++;
+
+ while (ib < ibtail)
+ *ob++ = *ib++;
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(unicode_version, u8s,
+ ib, sz, is_it_toupper);
+
+ if ((obtail - ob) < i) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ ib += sz;
+
+ /* sz is reused as the copy index here. */
+ for (sz = 0; sz < i; sz++)
+ *ob++ = u8s[sz];
+ } else {
+ if ((obtail - ob) < sz) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < sz; i++)
+ *ob++ = *ib++;
+ }
+ }
+ }
+ } else {
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ /*
+ * If the current character is a 7-bit ASCII
+ * character and it is the last character, or,
+ * if the current character is a 7-bit ASCII
+ * character and the next character is also a 7-bit
+ * ASCII character, then, we copy over this
+ * character without going through collect_a_seq().
+ *
+ * In any other cases, we need to look further with
+ * the collect_a_seq() function.
+ */
+ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else {
+ /* Cleared so a stale value is not mistaken below. */
+ *errnum = 0;
+ state = U8_STATE_START;
+
+ j = collect_a_seq(unicode_version, u8s,
+ &ib, ibtail,
+ is_it_toupper,
+ is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition,
+ errnum, &state);
+
+ if (*errnum && do_not_ignore_invalid) {
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /* ib has already moved past this sequence. */
+ if ((obtail - ob) < j) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < j; i++)
+ *ob++ = u8s[i];
+ }
+ }
+ }
+
+ /* Report what is left of the input and of the output buffer. */
+ *inlen = ibtail - ib;
+ *outlen = obtail - ob;
+
+ return (ret_val);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
new file mode 100644
index 000000000000..74517a3f6920
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file is intended for functions that ought to be common between user
+ * land (libzfs) and the kernel. When many common routines need to be shared
+ * then a separate file should be created.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/nvpair.h>
+
+/*
+ * Are there allocatable vdevs?
+ */
+boolean_t
+zfs_allocatable_devs(nvlist_t *nv)
+{
+	nvlist_t **kids;
+	uint_t nkids;
+	uint_t idx;
+	boolean_t found = B_FALSE;
+
+	/*
+	 * A config without a children array has nothing to allocate from.
+	 * Otherwise the first child that is not flagged as a log device
+	 * settles the question; a child lacking the flag counts as
+	 * "not a log".
+	 */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		for (idx = 0; idx < nkids && !found; idx++) {
+			uint64_t log_flag = 0;
+
+			(void) nvlist_lookup_uint64(kids[idx],
+			    ZPOOL_CONFIG_IS_LOG, &log_flag);
+			if (log_flag == 0)
+				found = B_TRUE;
+		}
+	}
+
+	return (found);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
new file mode 100644
index 000000000000..f517044a80a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_COMUTIL_H
+#define _ZFS_COMUTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_COMUTIL_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
new file mode 100644
index 000000000000..0fd5800a84dc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
@@ -0,0 +1,234 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#include <strings.h>
+#include <libnvpair.h>
+#include <ctype.h>
+#endif
+/* XXX includes zfs_context.h, so why bother with the above? */
+#include <sys/dsl_deleg.h>
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+
+/*
+ * permission table
+ *
+ * Keep this table in sorted order
+ *
+ * This table is used for displaying all permissions for
+ * zfs allow
+ *
+ * Each entry pairs a permission name (z_perm) with a note id (z_note).
+ * The final entry has a NULL z_perm; zfs_deleg_canonicalize_perm() stops
+ * its scan there, so the terminator must stay last.
+ *
+ * NOTE(review): the snapshot, share and send entries below are not in
+ * alphabetical order, contrary to the rule above -- confirm whether the
+ * order matters to any consumer before moving them.
+ */
+
+zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
+ {ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW},
+ {ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+ {ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+ {ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+ {ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+ {ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+ {ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+ {ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+ {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+ {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+ {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+ {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
+ {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+ {NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/*
+ * Check a permission name: returns 0 when it canonicalizes to a known
+ * permission or delegatable property, otherwise whatever
+ * permset_namecheck() says about it.
+ */
+static int
+zfs_valid_permission_name(const char *perm)
+{
+	return (zfs_deleg_canonicalize_perm(perm) != NULL ?
+	    0 : permset_namecheck(perm, NULL, NULL));
+}
+
+/*
+ * Map a permission name onto its canonical spelling.
+ *
+ * A name found in zfs_deleg_perm_tab is returned as passed in. Failing
+ * that, the name of a delegatable property is returned in the form
+ * zfs_prop_to_name() gives it. Anything else yields NULL.
+ */
+const char *
+zfs_deleg_canonicalize_perm(const char *perm)
+{
+	zfs_deleg_perm_tab_t *ent;
+	zfs_prop_t prop;
+
+	/* The table ends with a NULL z_perm entry. */
+	for (ent = zfs_deleg_perm_tab; ent->z_perm != NULL; ent++) {
+		if (strcmp(perm, ent->z_perm) == 0)
+			return (perm);
+	}
+
+	prop = zfs_name_to_prop(perm);
+	if (prop == ZPROP_INVAL || !zfs_prop_delegatable(prop))
+		return (NULL);
+
+	return (zfs_prop_to_name(prop));
+}
+
+/*
+ * Validate a delegation "who" key.
+ *
+ * The key is laid out as <type><inherit><sep><data>: who[0] is the entry
+ * type, who[1] the inheritance character, who[2] must be
+ * ZFS_DELEG_FIELD_SEP_CHR and the data starts at who[3].
+ *
+ *	user/group (and their set forms): inherit is local or descendent,
+ *	    data is made of decimal digits only
+ *	named set (and its set form): inherit is ZFS_DELEG_NA, data is
+ *	    checked with permset_namecheck()
+ *	create (and its set form): inherit is ZFS_DELEG_NA, no data
+ *	everyone (and its set form): inherit is local or descendent,
+ *	    no data
+ *
+ * Returns 0 for a well-formed key and -1 otherwise; for named sets the
+ * result of permset_namecheck() is returned as is.
+ */
+static int
+zfs_validate_who(char *who)
+{
+	char *p;
+
+	/*
+	 * Make sure the string really is at least three bytes long before
+	 * looking at who[2]; otherwise a short key would be read past its
+	 * terminating NUL.
+	 */
+	if (who == NULL || who[0] == '\0' || who[1] == '\0')
+		return (-1);
+
+	if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
+		return (-1);
+
+	switch (who[0]) {
+	case ZFS_DELEG_USER:
+	case ZFS_DELEG_GROUP:
+	case ZFS_DELEG_USER_SETS:
+	case ZFS_DELEG_GROUP_SETS:
+		if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+			return (-1);
+		/*
+		 * The cast keeps bytes with the high bit set from reaching
+		 * isdigit() as negative values.
+		 */
+		for (p = &who[3]; *p; p++)
+			if (!isdigit((unsigned char)*p))
+				return (-1);
+		break;
+
+	case ZFS_DELEG_NAMED_SET:
+	case ZFS_DELEG_NAMED_SET_SETS:
+		if (who[1] != ZFS_DELEG_NA)
+			return (-1);
+		return (permset_namecheck(&who[3], NULL, NULL));
+
+	case ZFS_DELEG_CREATE:
+	case ZFS_DELEG_CREATE_SETS:
+		if (who[1] != ZFS_DELEG_NA)
+			return (-1);
+		if (who[3] != '\0')
+			return (-1);
+		break;
+
+	case ZFS_DELEG_EVERYONE:
+	case ZFS_DELEG_EVERYONE_SETS:
+		if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+			return (-1);
+		if (who[3] != '\0')
+			return (-1);
+		break;
+
+	default:
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Verify a delegation nvlist: every pair name must be a valid "who" key
+ * and every permission listed under it must have a valid name.
+ *
+ * Returns 0 when the whole list checks out. Returns -1 for a NULL or
+ * empty list, a bad who key, a failed lookup of the who's nvlist for any
+ * reason other than ENOENT, an empty permission list, or a bad permission
+ * name. A who whose nvlist lookup gives ENOENT is skipped.
+ */
+int
+zfs_deleg_verify_nvlist(nvlist_t *nvp)
+{
+	nvpair_t *wp;
+	nvpair_t *pp;
+	nvlist_t *perms;
+	int error;
+
+	if (nvp == NULL)
+		return (-1);
+
+	wp = nvlist_next_nvpair(nvp, NULL);
+	if (wp == NULL)
+		return (-1);
+
+	for (; wp != NULL; wp = nvlist_next_nvpair(nvp, wp)) {
+		if (zfs_validate_who(nvpair_name(wp)) != 0)
+			return (-1);
+
+		error = nvlist_lookup_nvlist(nvp, nvpair_name(wp), &perms);
+		if (error == ENOENT)
+			continue;
+		if (error != 0)
+			return (-1);
+
+		pp = nvlist_next_nvpair(perms, NULL);
+		if (pp == NULL)
+			return (-1);
+
+		for (; pp != NULL; pp = nvlist_next_nvpair(perms, pp)) {
+			if (zfs_valid_permission_name(nvpair_name(pp)) != 0)
+				return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Build the base attribute name for a delegation entry. The base
+ * attribute name is the key used to locate the jump object holding the
+ * actual permissions; it encodes the entry type and whether the
+ * permission is local or descendent.
+ *
+ * Arguments:
+ *	attr - output buffer for the attribute name; assumed to be
+ *	    ZFS_MAX_DELEG_NAME bytes long.
+ *	type - type of entry to construct
+ *	inheritchr - inheritance type (local, descendent, or NA for create
+ *	    and permission set definitions)
+ *	data - either a permission set name or a pointer to a 64 bit
+ *	    uid/gid, depending on type.
+ */
+void
+zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+    char inheritchr, void *data)
+{
+	const int attrsz = ZFS_MAX_DELEG_NAME;
+
+	switch (type) {
+	case ZFS_DELEG_USER_SETS:
+	case ZFS_DELEG_GROUP_SETS:
+	case ZFS_DELEG_USER:
+	case ZFS_DELEG_GROUP:
+		/* data points at the numeric id */
+		(void) snprintf(attr, attrsz, "%c%c%c%lld", type, inheritchr,
+		    ZFS_DELEG_FIELD_SEP_CHR,
+		    (longlong_t)*(uint64_t *)data);
+		break;
+	case ZFS_DELEG_NAMED_SET:
+	case ZFS_DELEG_NAMED_SET_SETS:
+		/* data is the set name; inheritance is always '-' */
+		(void) snprintf(attr, attrsz, "%c-%c%s", type,
+		    ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
+		break;
+	case ZFS_DELEG_CREATE_SETS:
+	case ZFS_DELEG_CREATE:
+		(void) snprintf(attr, attrsz, "%c-%c", type,
+		    ZFS_DELEG_FIELD_SEP_CHR);
+		break;
+	case ZFS_DELEG_EVERYONE_SETS:
+	case ZFS_DELEG_EVERYONE:
+		(void) snprintf(attr, attrsz, "%c%c%c", type, inheritchr,
+		    ZFS_DELEG_FIELD_SEP_CHR);
+		break;
+	default:
+		ASSERT(!"bad zfs_deleg_who_type_t");
+	}
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
new file mode 100644
index 000000000000..561b73e63df4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_DELEG_H
+#define _ZFS_DELEG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */
+#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */
+
+/*
+ * Max name length for a delegation attribute
+ */
+#define ZFS_MAX_DELEG_NAME 128
+
+#define ZFS_DELEG_LOCAL 'l' /* local permission */
+#define ZFS_DELEG_DESCENDENT 'd' /* descendent permission */
+#define ZFS_DELEG_NA '-' /* inheritance not applicable */
+
+typedef enum {
+ ZFS_DELEG_NOTE_CREATE,
+ ZFS_DELEG_NOTE_DESTROY,
+ ZFS_DELEG_NOTE_SNAPSHOT,
+ ZFS_DELEG_NOTE_ROLLBACK,
+ ZFS_DELEG_NOTE_CLONE,
+ ZFS_DELEG_NOTE_PROMOTE,
+ ZFS_DELEG_NOTE_RENAME,
+ ZFS_DELEG_NOTE_RECEIVE,
+ ZFS_DELEG_NOTE_ALLOW,
+ ZFS_DELEG_NOTE_USERPROP,
+ ZFS_DELEG_NOTE_MOUNT,
+ ZFS_DELEG_NOTE_SHARE,
+ ZFS_DELEG_NOTE_NONE
+} zfs_deleg_note_t;
+
+typedef struct zfs_deleg_perm_tab {
+ char *z_perm;
+ zfs_deleg_note_t z_note;
+} zfs_deleg_perm_tab_t;
+
+extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];
+
+int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
+void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char checkflag, void *data); /* checkflag is the definition's inheritchr */
+const char *zfs_deleg_canonicalize_perm(const char *perm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_DELEG_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
index 2004d860d329..a9d109be20ab 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,7 +44,9 @@
#endif
#include <sys/param.h>
+#include <sys/nvpair.h>
#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
static int
valid_char(char c)
@@ -52,7 +54,7 @@ valid_char(char c)
return ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
+ c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
}
/*
@@ -90,6 +92,32 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
return (0);
}
+
+/*
+ * Permission set names must start with the character '@', followed by a
+ * name that passes snapshot_namecheck(); the whole name, including the
+ * '@', must be shorter than ZFS_PERMSET_MAXLEN characters.
+ */
+int
+permset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] != '@') {
+ if (why) {
+ *why = NAME_ERR_NO_AT;
+ *what = path[0]; /* what must be valid whenever why is */
+ }
+ return (-1);
+ }
+
+ return (snapshot_namecheck(&path[1], why, what));
+}
+
/*
* Dataset names must be of the following form:
*
@@ -98,7 +126,10 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
* Where each component is made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.:%] and ' ' (space)
+ *
+ * We allow '%' here as we use that character internally to create unique
+ * names for temporary clones (for online recv).
*/
int
dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -114,6 +145,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
* If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
* places using MAXNAMELEN.
*/
+
if (strlen(path) >= MAXNAMELEN) {
if (why)
*why = NAME_ERR_TOOLONG;
@@ -167,7 +199,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
/* Validate the contents of this component */
while (loc != end) {
- if (!valid_char(*loc)) {
+ if (!valid_char(*loc) && *loc != '%') {
if (why) {
*why = NAME_ERR_INVALCHAR;
*what = *loc;
@@ -211,6 +243,50 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
}
}
+
+/*
+ * Mountpoint names must be of the following form (only the leading '/'
+ * and the length of each component are actually checked):
+ * /[component][/]*[component][/]
+ */
+int
+mountpoint_namecheck(const char *path, namecheck_err_t *why)
+{
+ const char *start, *end;
+
+ /*
+ * Make sure none of the mountpoint component names are too long.
+ * If a component name is too long then the mkdir of the mountpoint
+ * will fail but then the mountpoint property will be set to a value
+ * that can never be mounted. Better to fail before setting the prop.
+ * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
+ */
+
+ if (path == NULL || *path != '/') { /* no leading '/' */
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ /* Skip leading slash */
+ start = &path[1];
+ do {
+ end = start;
+ while (*end != '/' && *end != '\0')
+ end++;
+
+ if (end - start >= MAXNAMELEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+ start = end + 1; /* next component starts after the '/' */
+
+ } while (*end != '\0');
+
+ return (0); /* every component is short enough */
+}
+
/*
* For pool names, we have the same set of valid characters as described in
* dataset names, with the additional restriction that the pool name must begin
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
index 7e0cda974cc6..ec85e62f72e8 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,12 +42,17 @@ typedef enum {
NAME_ERR_RESERVED, /* entire name is reserved */
NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
NAME_ERR_TOOLONG, /* name is too long */
+ NAME_ERR_NO_AT, /* permission set is missing '@' */
} namecheck_err_t;
+#define ZFS_PERMSET_MAXLEN 64
+
int pool_namecheck(const char *, namecheck_err_t *, char *);
int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int mountpoint_namecheck(const char *, namecheck_err_t *);
int dataset_name_hidden(const char *);
int snapshot_namecheck(const char *, namecheck_err_t *, char *);
+int permset_namecheck(const char *, namecheck_err_t *, char *);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index 71256192c092..27a6f2e3dd00 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -19,40 +19,19 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Master property table.
- *
- * This table keeps track of all the properties supported by ZFS, and their
- * various attributes. Not all of these are needed by the kernel, and several
- * are only used by a single libzfs client. But having them here centralizes
- * all property information in one location.
- *
- * name The human-readable string representing this property
- * proptype Basic type (string, boolean, number)
- * default Default value for the property. Sadly, C only allows
- * you to initialize the first member of a union, so we
- * have two default members for each property.
- * attr Attributes (readonly, inheritable) for the property
- * types Valid dataset types to which this applies
- * values String describing acceptable values for the property
- * colname The column header for 'zfs list'
- * colfmt The column formatting for 'zfs list'
- *
- * This table must match the order of property types in libzfs.h.
- */
-
#include <sys/zio.h>
#include <sys/spa.h>
+#include <sys/u8_textprep.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
#include "zfs_prop.h"
+#include "zfs_deleg.h"
#if defined(_KERNEL)
#include <sys/systm.h>
@@ -62,244 +41,283 @@
#include <ctype.h>
#endif
-typedef enum {
- prop_default,
- prop_readonly,
- prop_inherit
-} prop_attr_t;
-
-typedef struct {
- const char *pd_name;
- zfs_proptype_t pd_proptype;
- uint64_t pd_numdefault;
- const char *pd_strdefault;
- prop_attr_t pd_attr;
- int pd_types;
- const char *pd_values;
- const char *pd_colname;
- boolean_t pd_rightalign;
- boolean_t pd_visible;
-} prop_desc_t;
-
-static prop_desc_t zfs_prop_table[] = {
- { "type", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE,
- B_TRUE },
- { "creation", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE },
- { "used", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE },
- { "available", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE,
- B_TRUE },
- { "referenced", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY,
- "<size>", "REFER", B_TRUE, B_TRUE },
- { "compressratio", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE,
- B_TRUE },
- { "mounted", prop_type_boolean, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE },
- { "origin", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN",
- B_FALSE, B_TRUE },
- { "quota", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE },
- { "reservation", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<size> | none", "RESERV", B_TRUE, B_TRUE },
- { "volsize", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE },
- { "volblocksize", prop_type_number, 8192, NULL, prop_readonly,
- ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE,
- B_TRUE },
- { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL,
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE },
- { "mountpoint", prop_type_string, 0, "/", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE },
- { "sharenfs", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE },
- { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE,
- B_TRUE },
- { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE },
- { "atime", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "ATIME", B_TRUE, B_TRUE },
- { "devices", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "DEVICES", B_TRUE, B_TRUE },
- { "exec", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "EXEC", B_TRUE, B_TRUE },
- { "setuid", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
- B_TRUE, B_TRUE },
- { "readonly", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off", "RDONLY", B_TRUE, B_TRUE },
- { "jailed", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "JAILED", B_TRUE, B_TRUE },
- { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden",
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE },
- { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE },
- { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE,
- B_TRUE },
- { "createtxg", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE },
- { "name", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE },
- { "canmount", prop_type_boolean, 1, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "CANMOUNT", B_TRUE, B_TRUE },
- { "shareiscsi", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_ANY,
- "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE },
- { "iscsioptions", prop_type_string, 0, NULL, prop_inherit,
- ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE },
- { "xattr", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "XATTR", B_TRUE, B_TRUE },
- { "numclones", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE },
- { "copies", prop_type_index, 1, "1", prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE },
- { "bootfs", prop_type_string, 0, NULL, prop_default,
- ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE },
-};
-
-#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t)))
-
-/*
- * Returns TRUE if the property applies to the given dataset types.
- */
-int
-zfs_prop_valid_for_type(zfs_prop_t prop, int types)
-{
- return ((zfs_prop_table[prop].pd_types & types) != 0);
-}
-
-/*
- * Determine if the specified property is visible or not.
- */
-boolean_t
-zfs_prop_is_visible(zfs_prop_t prop)
-{
- if (prop < 0)
- return (B_FALSE);
-
- return (zfs_prop_table[prop].pd_visible);
-}
-
-/*
- * Iterate over all properties, calling back into the specified function
- * for each property. We will continue to iterate until we either
- * reach the end or the callback function something other than
- * ZFS_PROP_CONT.
- */
-zfs_prop_t
-zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type,
- boolean_t show_all)
-{
- int i;
-
- for (i = 0; i < ZFS_PROP_COUNT; i++) {
- if (zfs_prop_valid_for_type(i, type) &&
- (zfs_prop_is_visible(i) || show_all)) {
- if (func(i, cb) != ZFS_PROP_CONT)
- return (i);
- }
- }
- return (ZFS_PROP_CONT);
-}
+static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
-zfs_prop_t
-zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all)
+zprop_desc_t *
+zfs_prop_get_table(void)
{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all));
+ return (zfs_prop_table);
}
-zpool_prop_t
-zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all)
+void
+zfs_prop_init(void)
{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all));
-}
+ static zprop_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { NULL }
+ };
+
+ static zprop_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { NULL }
+ };
+
+ static zprop_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { NULL }
+ };
+
+ static zprop_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "secure", ZFS_ACL_RESTRICTED }, /* backward compatibility */
+ { NULL }
+ };
+
+ static zprop_index_t case_table[] = {
+ { "sensitive", ZFS_CASE_SENSITIVE },
+ { "insensitive", ZFS_CASE_INSENSITIVE },
+ { "mixed", ZFS_CASE_MIXED },
+ { NULL }
+ };
+
+ static zprop_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+ };
-zfs_proptype_t
-zfs_prop_get_type(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype);
-}
-
-static boolean_t
-propname_match(const char *p, zfs_prop_t prop, size_t len)
-{
- const char *propname = zfs_prop_table[prop].pd_name;
-#ifndef _KERNEL
- const char *colname = zfs_prop_table[prop].pd_colname;
- int c;
-#endif
-
-#ifndef _KERNEL
- if (colname == NULL)
- return (B_FALSE);
-#endif
-
- if (len == strlen(propname) &&
- strncmp(p, propname, len) == 0)
- return (B_TRUE);
-
-#ifndef _KERNEL
- if (len != strlen(colname))
- return (B_FALSE);
-
- for (c = 0; c < len; c++)
- if (p[c] != tolower(colname[c]))
- break;
-
- return (colname[c] == '\0');
-#else
- return (B_FALSE);
-#endif
-}
-
-zfs_prop_t
-zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data)
-{
- const char *propname = cb_data;
-
- if (propname_match(propname, prop, strlen(propname)))
- return (prop);
-
- return (ZFS_PROP_CONT);
+ /*
+ * Use the unique flags we have to send to u8_strcmp() and/or
+ * u8_textprep() to represent the various normalization property
+ * values.
+ */
+ static zprop_index_t normalize_table[] = {
+ { "none", 0 },
+ { "formD", U8_TEXTPREP_NFD },
+ { "formKC", U8_TEXTPREP_NFKC },
+ { "formC", U8_TEXTPREP_NFC },
+ { "formKD", U8_TEXTPREP_NFKD },
+ { NULL }
+ };
+
+ static zprop_index_t version_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { "current", ZPL_VERSION },
+ { NULL }
+ };
+
+ static zprop_index_t boolean_table[] = {
+ { "off", 0 },
+ { "on", 1 },
+ { NULL }
+ };
+
+ static zprop_index_t canmount_table[] = {
+ { "off", ZFS_CANMOUNT_OFF },
+ { "on", ZFS_CANMOUNT_ON },
+ { "noauto", ZFS_CANMOUNT_NOAUTO },
+ { NULL }
+ };
+
+ static zprop_index_t cache_table[] = {
+ { "none", ZFS_CACHE_NONE },
+ { "metadata", ZFS_CACHE_METADATA },
+ { "all", ZFS_CACHE_ALL },
+ { NULL }
+ };
+
+ /* inherit index properties */
+ register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
+ checksum_table);
+ register_index(ZFS_PROP_COMPRESSION, "compression",
+ ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
+ register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", snapdir_table);
+ register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
+ register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | restricted | passthrough",
+ "ACLINHERIT", acl_inherit_table);
+ register_index(ZFS_PROP_COPIES, "copies", 1,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", copies_table);
+ register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "PRIMARYCACHE", cache_table);
+ register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "SECONDARYCACHE", cache_table);
+
+ /* inherit index (boolean) properties */
+ register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
+ register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
+ boolean_table);
+ register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
+ boolean_table);
+ register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ boolean_table);
+ register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
+ boolean_table);
+ register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+ register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
+ boolean_table);
+ register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
+ boolean_table);
+ register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
+ boolean_table);
+
+ /* default index properties */
+ register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "1 | 2 | 3 | current", "VERSION", version_table);
+ register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
+ "CANMOUNT", canmount_table);
+
+ /* readonly index (boolean) properties */
+ register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+
+ /* set once index properties */
+ register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "none | formC | formD | formKC | formKD", "NORMALIZATION",
+ normalize_table);
+ register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "sensitive | insensitive | mixed", "CASE", case_table);
+
+ /* set once index (boolean) properties */
+ register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "UTF8ONLY", boolean_table);
+
+ /* string properties */
+ register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
+ register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
+ register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
+ register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
+ ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
+ register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
+ register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
+
+ /* readonly number properties */
+ register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "USED");
+ register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
+ register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "REFER");
+ register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "RATIO");
+ register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192,
+ PROP_ONETIME,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
+ register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
+ register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
+ register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
+ register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+
+ /* default number properties */
+ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
+ register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
+ register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+ register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+ register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "REFRESERV");
+
+ /* inherit number properties */
+ register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
+ PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+
+ /* hidden properties */
+ register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
+ register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
+ register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
+ PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
+ ZFS_TYPE_DATASET, "GUID");
+
+ /* oddball properties */
+ register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
}
-/*
- * Given a property name and its type, returns the corresponding property ID.
- */
-zfs_prop_t
-zfs_name_to_prop_common(const char *propname, zfs_type_t type)
+boolean_t
+zfs_prop_delegatable(zfs_prop_t prop)
{
- zfs_prop_t prop;
-
- prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname,
- type, B_TRUE);
- return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop);
+ zprop_desc_t *pd = &zfs_prop_table[prop];
+ return (pd->pd_attr != PROP_READONLY);
}
/*
@@ -308,17 +326,9 @@ zfs_name_to_prop_common(const char *propname, zfs_type_t type)
zfs_prop_t
zfs_name_to_prop(const char *propname)
{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY));
+ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
}
-/*
- * Given a pool property name, returns the corresponding property ID.
- */
-zpool_prop_t
-zpool_name_to_prop(const char *propname)
-{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL));
-}
/*
* For user property names, we allow all lowercase alphanumeric characters, plus
@@ -357,179 +367,85 @@ zfs_prop_user(const char *name)
}
/*
- * Return the default value for the given property.
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
*/
-const char *
-zfs_prop_default_string(zfs_prop_t prop)
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
{
- return (zfs_prop_table[prop].pd_strdefault);
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
}
-uint64_t
-zfs_prop_default_numeric(zfs_prop_t prop)
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
{
- return (zfs_prop_table[prop].pd_numdefault);
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
}
/*
- * Returns TRUE if the property is readonly.
+ * Returns TRUE if the property applies to any of the given dataset types.
*/
-int
-zfs_prop_readonly(zfs_prop_t prop)
+boolean_t
+zfs_prop_valid_for_type(int prop, zfs_type_t types)
{
- return (zfs_prop_table[prop].pd_attr == prop_readonly);
+ return (zprop_valid_for_type(prop, types));
}
-/*
- * Given a dataset property ID, returns the corresponding name.
- * Assuming the zfs dataset propety ID is valid.
- */
-const char *
-zfs_prop_to_name(zfs_prop_t prop)
+zprop_type_t
+zfs_prop_get_type(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_name);
+ return (zfs_prop_table[prop].pd_proptype);
}
/*
- * Given a pool property ID, returns the corresponding name.
- * Assuming the pool propety ID is valid.
+ * Returns TRUE if the property is readonly.
*/
-const char *
-zpool_prop_to_name(zpool_prop_t prop)
+boolean_t
+zfs_prop_readonly(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_name);
+ return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
/*
- * Returns TRUE if the property is inheritable.
+ * Returns TRUE if the property is only allowed to be set once.
*/
-int
-zfs_prop_inheritable(zfs_prop_t prop)
+boolean_t
+zfs_prop_setonce(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_attr == prop_inherit);
+ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
-typedef struct zfs_index {
- const char *name;
- uint64_t index;
-} zfs_index_t;
-
-static zfs_index_t checksum_table[] = {
- { "on", ZIO_CHECKSUM_ON },
- { "off", ZIO_CHECKSUM_OFF },
- { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
- { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
- { "sha256", ZIO_CHECKSUM_SHA256 },
- { NULL }
-};
-
-static zfs_index_t compress_table[] = {
- { "on", ZIO_COMPRESS_ON },
- { "off", ZIO_COMPRESS_OFF },
- { "lzjb", ZIO_COMPRESS_LZJB },
- { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */
- { "gzip-1", ZIO_COMPRESS_GZIP_1 },
- { "gzip-2", ZIO_COMPRESS_GZIP_2 },
- { "gzip-3", ZIO_COMPRESS_GZIP_3 },
- { "gzip-4", ZIO_COMPRESS_GZIP_4 },
- { "gzip-5", ZIO_COMPRESS_GZIP_5 },
- { "gzip-6", ZIO_COMPRESS_GZIP_6 },
- { "gzip-7", ZIO_COMPRESS_GZIP_7 },
- { "gzip-8", ZIO_COMPRESS_GZIP_8 },
- { "gzip-9", ZIO_COMPRESS_GZIP_9 },
- { NULL }
-};
-
-static zfs_index_t snapdir_table[] = {
- { "hidden", ZFS_SNAPDIR_HIDDEN },
- { "visible", ZFS_SNAPDIR_VISIBLE },
- { NULL }
-};
-
-static zfs_index_t acl_mode_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "groupmask", ZFS_ACL_GROUPMASK },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t acl_inherit_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "noallow", ZFS_ACL_NOALLOW },
- { "secure", ZFS_ACL_SECURE },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t copies_table[] = {
- { "1", 1 },
- { "2", 2 },
- { "3", 3 },
- { NULL }
-};
-
-static zfs_index_t *
-zfs_prop_index_table(zfs_prop_t prop)
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
{
- switch (prop) {
- case ZFS_PROP_CHECKSUM:
- return (checksum_table);
- case ZFS_PROP_COMPRESSION:
- return (compress_table);
- case ZFS_PROP_SNAPDIR:
- return (snapdir_table);
- case ZFS_PROP_ACLMODE:
- return (acl_mode_table);
- case ZFS_PROP_ACLINHERIT:
- return (acl_inherit_table);
- case ZFS_PROP_COPIES:
- return (copies_table);
- default:
- return (NULL);
- }
+ return (zfs_prop_table[prop].pd_strdefault);
}
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
/*
- * Tables of index types, plus functions to convert between the user view
- * (strings) and internal representation (uint64_t).
+ * Given a dataset property ID, returns the corresponding name.
+ * Assuming the zfs dataset property ID is valid.
*/
-int
-zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (strcmp(string, table[i].name) == 0) {
- *index = table[i].index;
- return (0);
- }
- }
-
- return (-1);
+ return (zfs_prop_table[prop].pd_name);
}
-int
-zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+boolean_t
+zfs_prop_inheritable(zfs_prop_t prop)
{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (table[i].index == index) {
- *string = table[i].name;
- return (0);
- }
- }
-
- return (-1);
+ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
#ifndef _KERNEL
@@ -541,22 +457,6 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
const char *
zfs_prop_values(zfs_prop_t prop)
{
- if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL)
- return (NULL);
-
- return (zfs_prop_table[prop].pd_values);
-}
-
-/*
- * Returns a string describing the set of acceptable values for the given
- * zpool property, or NULL if it cannot be set.
- */
-const char *
-zpool_prop_values(zfs_prop_t prop)
-{
- if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL)
- return (NULL);
-
return (zfs_prop_table[prop].pd_values);
}
@@ -568,8 +468,8 @@ zpool_prop_values(zfs_prop_t prop)
int
zfs_prop_is_string(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_proptype == prop_type_string ||
- zfs_prop_table[prop].pd_proptype == prop_type_index);
+ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
+ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
}
/*
@@ -592,66 +492,4 @@ zfs_prop_align_right(zfs_prop_t prop)
return (zfs_prop_table[prop].pd_rightalign);
}
-/*
- * Determines the minimum width for the column, and indicates whether it's fixed
- * or not. Only string columns are non-fixed.
- */
-size_t
-zfs_prop_width(zfs_prop_t prop, boolean_t *fixed)
-{
- prop_desc_t *pd = &zfs_prop_table[prop];
- zfs_index_t *idx;
- size_t ret;
- int i;
-
- *fixed = B_TRUE;
-
- /*
- * Start with the width of the column name.
- */
- ret = strlen(pd->pd_colname);
-
- /*
- * For fixed-width values, make sure the width is large enough to hold
- * any possible value.
- */
- switch (pd->pd_proptype) {
- case prop_type_number:
- /*
- * The maximum length of a human-readable number is 5 characters
- * ("20.4M", for example).
- */
- if (ret < 5)
- ret = 5;
- /*
- * 'creation' is handled specially because it's a number
- * internally, but displayed as a date string.
- */
- if (prop == ZFS_PROP_CREATION)
- *fixed = B_FALSE;
- break;
- case prop_type_boolean:
- /*
- * The maximum length of a boolean value is 3 characters, for
- * "off".
- */
- if (ret < 3)
- ret = 3;
- break;
- case prop_type_index:
- idx = zfs_prop_index_table(prop);
- for (i = 0; idx[i].name != NULL; i++) {
- if (strlen(idx[i].name) > ret)
- ret = strlen(idx[i].name);
- }
- break;
-
- case prop_type_string:
- *fixed = B_FALSE;
- break;
- }
-
- return (ret);
-}
-
#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
index 133e740ce6bc..da5ae43093e5 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,14 +40,87 @@ extern "C" {
* in the kernel, but the string value in userland.
*/
typedef enum {
- prop_type_number, /* numeric value */
- prop_type_string, /* string value */
- prop_type_boolean, /* boolean value */
- prop_type_index /* numeric value indexed by string */
-} zfs_proptype_t;
-
-zfs_proptype_t zfs_prop_get_type(zfs_prop_t);
-size_t zfs_prop_width(zfs_prop_t, boolean_t *);
+ PROP_TYPE_NUMBER, /* numeric value */
+ PROP_TYPE_STRING, /* string value */
+ PROP_TYPE_INDEX /* numeric value indexed by string */
+} zprop_type_t;
+
+typedef enum {
+ PROP_DEFAULT,
+ PROP_READONLY,
+ PROP_INHERIT,
+ /*
+ * ONETIME properties are a sort of conglomeration of READONLY
+ * and INHERIT. They can be set only during object creation,
+ * after that they are READONLY. If not explicitly set during
+ * creation, they can be inherited.
+ */
+ PROP_ONETIME
+} zprop_attr_t;
+
+typedef struct zfs_index {
+ const char *pi_name;
+ uint64_t pi_value;
+} zprop_index_t;
+
+typedef struct {
+ const char *pd_name; /* human-readable property name */
+ int pd_propnum; /* property number */
+ zprop_type_t pd_proptype; /* string, boolean, index, number */
+ const char *pd_strdefault; /* default for strings */
+ uint64_t pd_numdefault; /* for boolean / index / number */
+ zprop_attr_t pd_attr; /* default, readonly, inherit */
+ int pd_types; /* bitfield of valid dataset types */
+ /* fs | vol | snap; or pool */
+ const char *pd_values; /* string telling acceptable values */
+ const char *pd_colname; /* column header for "zfs list" */
+ boolean_t pd_rightalign; /* column alignment for "zfs list" */
+ boolean_t pd_visible; /* do we list this property with the */
+ /* "zfs get" help message */
+ const zprop_index_t *pd_table; /* for index properties, a table */
+ /* defining the possible values */
+} zprop_desc_t;
+
+/*
+ * zfs dataset property functions
+ */
+void zfs_prop_init(void);
+zprop_type_t zfs_prop_get_type(zfs_prop_t);
+boolean_t zfs_prop_delegatable(zfs_prop_t prop);
+zprop_desc_t *zfs_prop_get_table(void);
+
+/*
+ * zpool property functions
+ */
+void zpool_prop_init(void);
+zprop_type_t zpool_prop_get_type(zpool_prop_t);
+zprop_desc_t *zpool_prop_get_table(void);
+
+/*
+ * Common routines to initialize property tables
+ */
+void register_impl(int, const char *, zprop_type_t, uint64_t,
+ const char *, zprop_attr_t, int, const char *, const char *,
+ boolean_t, boolean_t, const zprop_index_t *);
+void register_string(int, const char *, const char *, zprop_attr_t attr,
+ int, const char *, const char *);
+void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *);
+void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *, const zprop_index_t *);
+void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+ int, const char *);
+
+/*
+ * Common routines for zfs and zpool property management
+ */
+int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
+int zprop_name_to_prop(const char *, zfs_type_t);
+int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
+int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+const char *zprop_values(int, zfs_type_t);
+size_t zprop_width(int, boolean_t *, zfs_type_t);
+boolean_t zprop_valid_for_type(int, zfs_type_t);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
new file mode 100644
index 000000000000..f5efe18d248b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -0,0 +1,186 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
+
+zprop_desc_t *
+zpool_prop_get_table(void)
+{
+ return (zpool_prop_table);
+}
+
+void
+zpool_prop_init(void)
+{
+ static zprop_index_t boolean_table[] = {
+ { "off", 0},
+ { "on", 1},
+ { NULL }
+ };
+
+ static zprop_index_t failuremode_table[] = {
+ { "wait", ZIO_FAILURE_MODE_WAIT },
+ { "continue", ZIO_FAILURE_MODE_CONTINUE },
+ { "panic", ZIO_FAILURE_MODE_PANIC },
+ { NULL }
+ };
+
+ /* string properties */
+ register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<path>", "ALTROOT");
+ register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
+ register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+
+ /* readonly number properties */
+ register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "SIZE");
+ register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "USED");
+ register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "AVAIL");
+ register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "CAP");
+ register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<guid>", "GUID");
+ register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<state>", "HEALTH");
+
+ /* default number properties */
+ register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+
+ /* default index (boolean) properties */
+ register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
+ register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
+
+ /* default index properties */
+ register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "wait | continue | panic", "FAILMODE", failuremode_table);
+
+ /* hidden properties */
+ register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+}
+
+/*
+ * Given a pool property name, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * Assumes the pool property ID is valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_name);
+}
+
+zprop_type_t
+zpool_prop_get_type(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_proptype);
+}
+
+boolean_t
+zpool_prop_readonly(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
+}
+
+const char *
+zpool_prop_default_string(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zpool_prop_default_numeric(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_numdefault);
+}
+
+int
+zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
+ uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
+}
+
+int
+zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
+ const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
+}
+
+#ifndef _KERNEL
+
+const char *
+zpool_prop_values(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_values);
+}
+
+const char *
+zpool_prop_column_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_colname);
+}
+
+boolean_t
+zpool_prop_align_right(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_rightalign);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
new file mode 100644
index 000000000000..87619e1cbf07
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Common routines used by zfs and zpool property management.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/libkern.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t *
+zprop_get_proptable(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (zpool_prop_get_table());
+ else
+ return (zfs_prop_get_table());
+}
+
+static int
+zprop_get_numprops(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (ZPOOL_NUM_PROPS);
+ else
+ return (ZFS_NUM_PROPS);
+}
+
+void
+register_impl(int prop, const char *name, zprop_type_t type,
+ uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
+{
+ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
+ zprop_desc_t *pd;
+
+ pd = &prop_tbl[prop];
+
+ ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+
+ pd->pd_name = name;
+ pd->pd_propnum = prop;
+ pd->pd_proptype = type;
+ pd->pd_numdefault = numdefault;
+ pd->pd_strdefault = strdefault;
+ pd->pd_attr = attr;
+ pd->pd_types = objset_types;
+ pd->pd_values = values;
+ pd->pd_colname = colname;
+ pd->pd_rightalign = rightalign;
+ pd->pd_visible = visible;
+ pd->pd_table = idx_tbl;
+}
+
+void
+register_string(int prop, const char *name, const char *def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, NULL);
+
+}
+
+void
+register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname)
+{
+ register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, NULL);
+}
+
+void
+register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ const zprop_index_t *idx_tbl)
+{
+ register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
+}
+
+void
+register_hidden(int prop, const char *name, zprop_type_t type,
+ zprop_attr_t attr, int objset_types, const char *colname)
+{
+ register_impl(prop, name, type, 0, NULL, attr,
+ objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
+}
+
+
+/*
+ * A comparison function we can use to order indexes into property tables.
+ */
+static int
+zprop_compare(const void *arg1, const void *arg2)
+{
+ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
+ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
+ boolean_t p1ro, p2ro;
+
+ p1ro = (p1->pd_attr == PROP_READONLY);
+ p2ro = (p2->pd_attr == PROP_READONLY);
+
+ if (p1ro == p2ro)
+ return (strcmp(p1->pd_name, p2->pd_name));
+
+ return (p1ro ? -1 : 1);
+}
+
+/*
+ * Iterate over all properties in the given property table, calling back
+ * into the specified function for each property. We will continue to
+ * iterate until we either reach the end or the callback function returns
+ * something other than ZPROP_CONT.
+ */
+int
+zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
+ boolean_t ordered, zfs_type_t type)
+{
+ int i, j, num_props, size, prop;
+ zprop_desc_t *prop_tbl;
+ zprop_desc_t **order;
+
+ prop_tbl = zprop_get_proptable(type);
+ num_props = zprop_get_numprops(type);
+ size = num_props * sizeof (zprop_desc_t *);
+
+#if defined(_KERNEL)
+ order = kmem_alloc(size, KM_SLEEP);
+#else
+ if ((order = malloc(size)) == NULL)
+ return (ZPROP_CONT);
+#endif
+
+ for (j = 0; j < num_props; j++)
+ order[j] = &prop_tbl[j];
+
+ if (ordered) {
+ qsort((void *)order, num_props, sizeof (zprop_desc_t *),
+ zprop_compare);
+ }
+
+ prop = ZPROP_CONT;
+ for (i = 0; i < num_props; i++) {
+ if ((order[i]->pd_visible || show_all) &&
+ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
+ prop = order[i]->pd_propnum;
+ break;
+ }
+ }
+
+#if defined(_KERNEL)
+ kmem_free(order, size);
+#else
+ free(order);
+#endif
+ return (prop);
+}
+
+static boolean_t
+propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
+{
+ const char *propname = prop_entry->pd_name;
+#ifndef _KERNEL
+ const char *colname = prop_entry->pd_colname;
+ int c;
+
+ if (colname == NULL)
+ return (B_FALSE);
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
+
+typedef struct name_to_prop_cb {
+ const char *propname;
+ zprop_desc_t *prop_tbl;
+} name_to_prop_cb_t;
+
+static int
+zprop_name_to_prop_cb(int prop, void *cb_data)
+{
+ name_to_prop_cb_t *data = cb_data;
+
+ if (propname_match(data->propname, strlen(data->propname),
+ &data->prop_tbl[prop]))
+ return (prop);
+
+ return (ZPROP_CONT);
+}
+
+int
+zprop_name_to_prop(const char *propname, zfs_type_t type)
+{
+ int prop;
+ name_to_prop_cb_t cb_data;
+
+ cb_data.propname = propname;
+ cb_data.prop_tbl = zprop_get_proptable(type);
+
+ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
+ B_TRUE, B_FALSE, type);
+
+ return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
+}
+
+int
+zprop_string_to_index(int prop, const char *string, uint64_t *index,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (strcmp(string, idx_tbl[i].pi_name) == 0) {
+ *index = idx_tbl[i].pi_value;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zprop_index_to_string(int prop, uint64_t index, const char **string,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (idx_tbl[i].pi_value == index) {
+ *string = idx_tbl[i].pi_name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+const char *
+zprop_values(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+
+ return (prop_tbl[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zprop_valid_for_type(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (B_FALSE);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ return ((prop_tbl[prop].pd_types & type) != 0);
+}
+
+#ifndef _KERNEL
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's fixed
+ * or not. Only string columns are non-fixed.
+ */
+size_t
+zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl, *pd;
+ const zprop_index_t *idx;
+ size_t ret;
+ int i;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+ pd = &prop_tbl[prop];
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ break;
+ case PROP_TYPE_INDEX:
+ idx = prop_tbl[prop].pd_table;
+ for (i = 0; idx[i].pi_name != NULL; i++) {
+ if (strlen(idx[i].pi_name) > ret)
+ ret = strlen(idx[i].pi_name);
+ }
+ break;
+
+ case PROP_TYPE_STRING:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 1800e792fa1e..cf49c78a5b0e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -20,11 +20,9 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This Makefile defines all file modules for the directory uts/common
# and its children. These are the source files which may be considered
# common to all SunOS systems.
@@ -46,7 +44,9 @@ ZFS_COMMON_OBJS += \
dsl_pool.o \
dsl_synctask.o \
dmu_zfetch.o \
+ dsl_deleg.o \
dsl_prop.o \
+ dsl_scrub.o \
fletcher.o \
gzip.o \
lzjb.o \
@@ -64,6 +64,7 @@ ZFS_COMMON_OBJS += \
unique.o \
vdev.o \
vdev_cache.o \
+ vdev_file.o \
vdev_label.o \
vdev_mirror.o \
vdev_missing.o \
@@ -75,6 +76,7 @@ ZFS_COMMON_OBJS += \
zap_micro.o \
zfs_byteswap.o \
zfs_fm.o \
+ zfs_fuid.o \
zfs_znode.o \
zil.o \
zio.o \
@@ -84,7 +86,11 @@ ZFS_COMMON_OBJS += \
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
- zfs_prop.o
+ zfs_deleg.o \
+ zfs_prop.o \
+ zfs_comutil.o \
+ zpool_prop.o \
+ zprop_common.o
ZFS_OBJS += \
$(ZFS_COMMON_OBJS) \
@@ -96,6 +102,7 @@ ZFS_OBJS += \
zfs_log.o \
zfs_replay.o \
zfs_rlock.o \
+ rrwlock.o \
zfs_vfsops.o \
zfs_vnops.o \
zvol.o
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index dd2aa82304ab..d9eb88a40202 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -20,7 +20,7 @@
*/
/* Portions Copyright 2007 Shivakumar GN */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
+#include <sys/sunddi.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
@@ -60,7 +61,7 @@
*
* These routines are designed to play a support role for existing
* pseudo-filesystems (such as procfs). They simplify common tasks,
- * without enforcing the filesystem to hand over management to GFS. The
+ * without forcing the filesystem to hand over management to GFS. The
* routines covered are:
*
* gfs_readdir_init()
@@ -116,6 +117,42 @@
*/
/*
+ * gfs_get_parent_ino: used to obtain a parent inode number and the
+ * inode number of the given vnode in preparation for calling gfs_readdir_init.
+ */
+int
+gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ ino64_t *pino, ino64_t *ino)
+{
+ vnode_t *parent;
+ gfs_dir_t *dp = dvp->v_data;
+ int error;
+
+ *ino = dp->gfsd_file.gfs_ino;
+ parent = dp->gfsd_file.gfs_parent;
+
+ if (parent == NULL) {
+ *pino = *ino; /* root of filesystem */
+ } else if (dvp->v_flag & V_XATTRDIR) {
+#ifdef TODO
+ vattr_t va;
+
+ va.va_mask = AT_NODEID;
+ error = VOP_GETATTR(parent, &va, 0, cr, ct);
+ if (error)
+ return (error);
+ *pino = va.va_nodeid;
+#else
+ panic("%s:%u: not implemented", __func__, __LINE__);
+#endif
+ } else {
+ *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
+ }
+
+ return (0);
+}
+
+/*
* gfs_readdir_init: initiate a generic readdir
* st - a pointer to an uninitialized gfs_readdir_state_t structure
* name_max - the directory's maximum file name length
@@ -123,6 +160,7 @@
* uiop - the uiop passed to readdir
* parent - the parent directory's inode
* self - this directory's inode
+ * flags - flags from VOP_READDIR
*
* Returns 0 or a non-zero errno.
*
@@ -153,8 +191,10 @@
*/
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
- uio_t *uiop, ino64_t parent, ino64_t self)
+ uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
+ size_t dirent_size;
+
if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
(uiop->uio_loffset % ureclen) != 0)
return (EINVAL);
@@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
st->grd_ureclen = ureclen;
st->grd_oresid = uiop->uio_resid;
st->grd_namlen = name_max;
- st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
+ if (flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
st->grd_parent = parent;
st->grd_self = self;
+ st->grd_flags = flags;
return (0);
}
@@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
/*
* gfs_readdir_emit_int: internal routine to emit directory entry
*
- * st - the current readdir state, which must have d_ino and d_name
- * set
+ * st - the current readdir state, which must have d_ino/ed_ino
+ * and d_name/ed_name set
* uiop - caller-supplied uio pointer
* next - the offset of the next entry
*/
@@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
int *ncookies, u_long **cookies)
{
int reclen, namlen;
+ dirent64_t *dp;
+ edirent_t *edp;
- namlen = strlen(st->grd_dirent->d_name);
- reclen = DIRENT64_RECLEN(namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp = st->grd_dirent;
+ namlen = strlen(edp->ed_name);
+ reclen = EDIRENT_RECLEN(namlen);
+ } else {
+ dp = st->grd_dirent;
+ namlen = strlen(dp->d_name);
+ reclen = DIRENT64_RECLEN(namlen);
+ }
if (reclen > uiop->uio_resid) {
/*
@@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
return (-1);
}
- /* XXX: This can change in the future. */
- st->grd_dirent->d_type = DT_DIR;
- st->grd_dirent->d_reclen = (ushort_t)reclen;
- st->grd_dirent->d_namlen = namlen;
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp->ed_off = next;
+ edp->ed_reclen = (ushort_t)reclen;
+ } else {
+ /* XXX: This can change in the future. */
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_type = DT_DIR;
+ dp->d_namlen = namlen;
+ }
if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
return (EFAULT);
@@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
* voff - the virtual offset (obtained from gfs_readdir_pred)
* ino - the entry's inode
* name - the entry's name
+ * eflags - value for ed_eflags (if processing edirent_t)
*
* Returns a 0 on success, a non-zero errno on failure, or -1 if the
* readdir loop should terminate. A non-zero result (either errno or
@@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
*/
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
- ino64_t ino, const char *name, int *ncookies, u_long **cookies)
+ ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies)
{
offset_t off = (voff + 2) * st->grd_ureclen;
- st->grd_dirent->d_ino = ino;
- (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edirent_t *edp = st->grd_dirent;
+
+ edp->ed_ino = ino;
+ (void) strncpy(edp->ed_name, name, st->grd_namlen);
+ edp->ed_eflags = eflags;
+ } else {
+ dirent64_t *dp = st->grd_dirent;
+
+ dp->d_ino = ino;
+ (void) strncpy(dp->d_name, name, st->grd_namlen);
+ }
/*
* Inter-entry offsets are invalid, so we assume a record size of
@@ -266,11 +336,11 @@ top:
voff = off - 2;
if (off == 0) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
- ".", ncookies, cookies)) == 0)
+ ".", 0, ncookies, cookies)) == 0)
goto top;
} else if (off == 1) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
- "..", ncookies, cookies)) == 0)
+ "..", 0, ncookies, cookies)) == 0)
goto top;
} else {
*voffp = voff;
@@ -292,7 +362,13 @@ top:
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
- kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
+ size_t dirent_size;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ kmem_free(st->grd_dirent, dirent_size);
if (error > 0)
return (error);
if (eofp)
@@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp)
gfs_dir_t *dp = NULL;
void *data;
- if (fp->gfs_parent == NULL)
+ if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
goto found;
dp = fp->gfs_parent->v_data;
@@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp)
ge = NULL;
found:
+ if (vp->v_flag & V_XATTRDIR)
+ VI_LOCK(fp->gfs_parent);
VI_LOCK(vp);
ASSERT(vp->v_count < 2);
/*
@@ -535,7 +613,8 @@ found:
* Free vnode and release parent
*/
if (fp->gfs_parent) {
- gfs_dir_unlock(dp);
+ if (dp)
+ gfs_dir_unlock(dp);
VI_LOCK(fp->gfs_parent);
fp->gfs_parent->v_usecount--;
VI_UNLOCK(fp->gfs_parent);
@@ -543,6 +622,8 @@ found:
ASSERT(vp->v_vfsp != NULL);
VFS_RELE(vp->v_vfsp);
}
+ if (vp->v_flag & V_XATTRDIR)
+ VI_UNLOCK(fp->gfs_parent);
return (data);
}
@@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp)
}
/*
- * gfs_dir_lookup()
+ * gfs_dir_lookup_dynamic()
*
- * Looks up the given name in the directory and returns the corresponding vnode,
- * if found.
+ * This routine looks up the provided name amongst the dynamic entries
+ * in the gfs directory and returns the corresponding vnode, if found.
*
- * First, we search statically defined entries, if any. If a match is found,
- * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
- * existing vnode. Otherwise, we call the static entry's callback routine,
- * caching the result if necessary.
+ * The gfs directory is expected to be locked by the caller prior to
+ * calling this function. The directory will be unlocked during the
+ * execution of this function, but will be locked upon return from the
+ * function. This function returns 0 on success, non-zero on error.
*
- * If no static entry is found, we invoke the lookup callback, if any. The
- * arguments to this callback are:
+ * The dynamic lookups are performed by invoking the lookup
+ * callback, which is passed to this function as the first argument.
+ * The arguments to the callback are:
*
- * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
+ * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
+ * ino64_t *inop, cred_t *cr, int flags, int *direntflgs, pathname_t *realpnp);
*
* pvp - parent vnode
* nm - name of entry
* vpp - pointer to resulting vnode
+ * cr - pointer to cred
+ * flags - flags value from lookup request
+ * ignored here; currently only used to request
+ * insensitive lookups
+ * direntflgs - output parameter, directory entry flags
+ * ignored here; currently only used to indicate a lookup
+ * has more than one possible match when case is not considered
+ * realpnp - output parameter, real pathname
+ * ignored here; when lookup was performed case-insensitively,
+ * this field contains the "real" name of the file.
*
* Returns 0 on success, non-zero on error.
*/
-int
-gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
+static int
+gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
+ const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
+ int *direntflags, pathname_t *realpnp)
{
- int i;
- gfs_dirent_t *ge;
- vnode_t *vp;
- gfs_dir_t *dp = dvp->v_data;
- int ret = 0;
-
- ASSERT(dvp->v_type == VDIR);
+ gfs_file_t *fp;
+ ino64_t ino;
+ int ret;
- if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
- return (0);
+ ASSERT(GFS_DIR_LOCKED(dp));
+ /*
+ * Drop the directory lock, as the lookup routine
+ * will need to allocate memory, or otherwise deadlock on this
+ * directory.
+ */
+ gfs_dir_unlock(dp);
+ ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
gfs_dir_lock(dp);
/*
+ * The callback for extended attributes returns a vnode
+ * with v_data from an underlying fs.
+ */
+ if (ret == 0 && !IS_XATTRDIR(dvp)) {
+ fp = (gfs_file_t *)((*vpp)->v_data);
+ fp->gfs_index = -1;
+ fp->gfs_ino = ino;
+ }
+
+ return (ret);
+}
+
+/*
+ * gfs_dir_lookup_static()
+ *
+ * This routine looks up the provided name amongst the static entries
+ * in the gfs directory and returns the corresponding vnode, if found.
+ * The first argument to the function is a pointer to the comparison
+ * function this function should use to decide if names are a match.
+ *
+ * If a match is found, and GFS_CACHE_VNODE is set and the vnode
+ * exists, we simply return the existing vnode. Otherwise, we call
+ * the static entry's callback routine, caching the result if
+ * necessary. If the idx pointer argument is non-NULL, we use it to
+ * return the index of the matching static entry.
+ *
+ * The gfs directory is expected to be locked by the caller prior to calling
+ * this function. The directory may be unlocked during the execution of
+ * this function, but will be locked upon return from the function.
+ *
+ * This function returns 0 if a match is found, ENOENT if not.
+ */
+static int
+gfs_dir_lookup_static(int (*compare)(const char *, const char *),
+ gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
+ vnode_t **vpp, pathname_t *rpnp)
+{
+ gfs_dirent_t *ge;
+ vnode_t *vp = NULL;
+ int i;
+
+ ASSERT(GFS_DIR_LOCKED(dp));
+
+ /*
* Search static entries.
*/
for (i = 0; i < dp->gfsd_nstatic; i++) {
ge = &dp->gfsd_static[i];
- if (strcmp(ge->gfse_name, nm) == 0) {
+ if (compare(ge->gfse_name, nm) == 0) {
+ if (rpnp)
+ (void) strlcpy(rpnp->pn_buf, ge->gfse_name,
+ rpnp->pn_bufsize);
+
if (ge->gfse_vnode) {
ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
vp = ge->gfse_vnode;
VN_HOLD(vp);
- goto out;
+ break;
}
/*
@@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
* need to do KM_SLEEP allocations. If we return from
* the constructor only to find that a parallel
* operation has completed, and GFS_CACHE_VNODE is set
- * for this entry, we discard the result in favor of the
- * cached vnode.
+ * for this entry, we discard the result in favor of
+ * the cached vnode.
*/
gfs_dir_unlock(dp);
vp = ge->gfse_ctor(dvp);
@@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
gfs_dir_lock(dp);
}
}
-
- goto out;
+ break;
}
}
- /*
- * See if there is a dynamic constructor.
- */
- if (dp->gfsd_lookup) {
- ino64_t ino;
- gfs_file_t *fp;
+ if (vp == NULL)
+ return (ENOENT);
+ else if (idx)
+ *idx = i;
+ *vpp = vp;
+ return (0);
+}
- /*
- * Once again, drop the directory lock, as the lookup routine
- * will need to allocate memory, or otherwise deadlock on this
- * directory.
- */
- gfs_dir_unlock(dp);
- ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
- gfs_dir_lock(dp);
- if (ret != 0)
- goto out;
+/*
+ * gfs_dir_lookup()
+ *
+ * Looks up the given name in the directory and returns the corresponding
+ * vnode, if found.
+ *
+ * First, we search statically defined entries, if any, with a call to
+ * gfs_dir_lookup_static(). If no static entry is found, and we have
+ * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
+ *
+ * This function returns 0 on success, non-zero on error.
+ */
+int
+gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ int flags, int *direntflags, pathname_t *realpnp)
+{
+ gfs_dir_t *dp = dvp->v_data;
+ boolean_t casecheck;
+ vnode_t *dynvp = NULL;
+ vnode_t *vp = NULL;
+ int (*compare)(const char *, const char *);
+ int error, idx;
- fp = (gfs_file_t *)vp->v_data;
- fp->gfs_index = -1;
- fp->gfs_ino = ino;
- } else {
- /*
- * No static entry found, and there is no lookup callback, so
- * return ENOENT.
- */
- ret = ENOENT;
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
+ return (0);
+
+ casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
+ if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
+ (flags & FIGNORECASE))
+ compare = strcasecmp;
+ else
+ compare = strcmp;
+
+ gfs_dir_lock(dp);
+
+ error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
+
+ if (vp && casecheck) {
+ gfs_dirent_t *ge;
+ int i;
+
+ for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (strcasecmp(ge->gfse_name, nm) == 0) {
+ *direntflags |= ED_CASE_CONFLICT;
+ goto out;
+ }
+ }
+ }
+
+ if ((error || casecheck) && dp->gfsd_lookup)
+ error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
+ &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
+
+ if (vp && dynvp) {
+ /* a static and a dynamic entry both match case-insensitively */
+ ASSERT(casecheck);
+ *direntflags |= ED_CASE_CONFLICT;
+ VN_RELE(dynvp);
+ } else if (vp == NULL) {
+ vp = dynvp;
+ } else if (error == ENOENT) {
+ error = 0;
+ } else if (error) {
+ VN_RELE(vp);
+ vp = NULL;
}
out:
gfs_dir_unlock(dp);
- if (ret == 0)
- *vpp = vp;
- else
- *vpp = NULL;
-
- return (ret);
+ *vpp = vp;
+ return (error);
}
/*
@@ -731,13 +921,15 @@ out:
* This is significantly more complex, thanks to the particulars of
* VOP_READDIR().
*
- * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- * offset_t *off, offset_t *nextoff, void *data)
+ * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
+ * offset_t *off, offset_t *nextoff, void *data, int flags)
*
* vp - directory vnode
* dp - directory entry, sized according to maxlen given to
* gfs_dir_create(). callback must fill in d_name and
- * d_ino.
+ * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
+ * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
+ * is set in 'flags'.
* eofp - callback must set to 1 when EOF has been reached
* off - on entry, the last offset read from the directory. Callback
* must set to the offset of the current entry, typically left
@@ -745,12 +937,13 @@ out:
* nextoff - callback must set to offset of next entry. Typically
* (off + 1)
* data - caller-supplied data
+ * flags - VOP_READDIR flags
*
* Return 0 on success, or error on failure.
*/
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
- u_long **cookies, void *data)
+ u_long **cookies, void *data, cred_t *cr, int flags)
{
gfs_readdir_state_t gstate;
int error, eof = 0;
@@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
offset_t off, next;
gfs_dir_t *dp = dvp->v_data;
- ino = dp->gfsd_file.gfs_ino;
-
- if (dp->gfsd_file.gfs_parent == NULL)
- pino = ino; /* root of filesystem */
- else
- pino = ((gfs_file_t *)
- (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
+ error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
+ if (error)
+ return (error);
if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
- pino, ino)) != 0)
+ pino, ino, flags)) != 0)
return (error);
while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
@@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
ino = dp->gfsd_inode(dvp, off);
if ((error = gfs_readdir_emit(&gstate, uiop,
- off, ino, dp->gfsd_static[off].gfse_name, ncookies,
- cookies)) != 0)
+ off, ino, dp->gfsd_static[off].gfse_name, 0,
+ ncookies, cookies)) != 0)
break;
} else if (dp->gfsd_readdir) {
@@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
if ((error = dp->gfsd_readdir(dvp,
gstate.grd_dirent, &eof, &off, &next,
- data)) != 0 || eof)
+ data, flags)) != 0 || eof)
break;
off += dp->gfsd_nstatic + 2;
@@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
}
/*
+ * gfs_vop_lookup: VOP_LOOKUP() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_lookup() as necessary.
+ *
+ * Only dvp, nm, vpp, cr, flags, direntflags and realpnp are forwarded to
+ * gfs_dir_lookup(); pnp, rdir and ct are accepted but not used here.
+ * The return value is whatever gfs_dir_lookup() returns.
+ */
+/* ARGSUSED */
+int
+gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
+}
+
+/*
* gfs_vop_readdir: VOP_READDIR() entry point
*
* For use directly in vnode ops table. Given a GFS directory, calls
@@ -827,6 +1031,7 @@ gfs_vop_readdir(ap)
{
vnode_t *vp = ap->a_vp;
uio_t *uiop = ap->a_uio;
+ cred_t *cr = ap->a_cred;
int *eofp = ap->a_eofflag;
int ncookies = 0;
u_long *cookies = NULL;
@@ -842,7 +1047,8 @@ gfs_vop_readdir(ap)
*ap->a_ncookies = ncookies;
}
- error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
+ error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL,
+ cr, 0);
if (error == 0) {
/* Subtract unused cookies */
@@ -882,6 +1088,9 @@ gfs_vop_inactive(ap)
if (data != NULL)
kmem_free(data, fp->gfs_size);
+
+ VI_LOCK(vp);
vp->v_data = NULL;
+ VI_UNLOCK(vp);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
new file mode 100644
index 000000000000..00a10aae8ec9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Prepare an xvattr_t for use: clear the whole structure, then record the
+ * bitmap size and magic number, flag the embedded vattr_t with AT_XVATTR,
+ * and aim xva_rtnattrmapp at the start of the returned-attribute bitmap.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+	bzero(xvap, sizeof (*xvap));
+	xvap->xva_vattr.va_mask = AT_XVATTR;
+	xvap->xva_magic = XVA_MAGIC;
+	xvap->xva_mapsize = XVA_MAPSIZE;
+	xvap->xva_rtnattrmapp = &xvap->xva_rtnattrmap[0];
+}
+
+/*
+ * Return a pointer to the xoptattr_t embedded in *xvap when AT_XVATTR is
+ * set in the embedded vattr_t's va_mask; return NULL when it is not.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+	if ((xvap->xva_vattr.va_mask & AT_XVATTR) == 0)
+		return (NULL);
+
+	return (&xvap->xva_xoptattrs);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 420f802f360d..7ca528033c4f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DVA-based Adjustable Replacement Cache
*
@@ -47,13 +45,13 @@
* There are times when it is not possible to evict the requested
* space. In these circumstances we are unable to adjust the cache
* size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slowes the flow of new data
- * into the cache until we can make space avaiable.
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
*
* 2. The Megiddo and Modha model assumes a fixed cache size.
* Pages are evicted when the cache is full and there is a cache
* miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory preasure from the
+ * high use, but also tries to react to memory pressure from the
* operating system: decreasing its size when system memory is
* tight.
*
@@ -75,7 +73,7 @@
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() inerface
+ * or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
* adjusting the cache use method 2. We therefor provide two
* types of locks: 1) the hash table lock array, and 2) the
@@ -109,6 +107,14 @@
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
*/
#include <sys/spa.h>
@@ -117,6 +123,7 @@
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
+#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern kmutex_t zfs_write_limit_lock;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan;
static int arc_dead;
/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
* These tunables are for performance analysis.
*/
-u_long zfs_arc_max;
-u_long zfs_arc_min;
-TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
-TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+int zfs_mdcomp_disable = 0;
+
+TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
+TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
"Maximum ARC size");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
"Minimum ARC size");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
- * Note that buffers can be on one of 5 states:
+ * Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recentely used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
- * When there are no active references to the buffer, they
- * are linked onto one of the lists in arc. These are the
- * only buffers that can be evicted or deleted.
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they are
+ * linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
*
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
@@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
* they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will aquire a DVA
* as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
*/
typedef struct arc_state {
- list_t arcs_list; /* linked list of evictable buffer in state */
- uint64_t arcs_lsize; /* total size of buffers in the linked list */
- uint64_t arcs_size; /* total size of all buffers in this state */
+ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
+ uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
} arc_state_t;
-/* The 5 states: */
+/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
@@ -222,6 +259,24 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_memory_throttle_count;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -252,7 +307,25 @@ static arc_stats_t arc_stats = {
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -299,6 +372,7 @@ static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
/*
* There are several ARC variables that are critical to export as kstats --
@@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost;
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
+ &arc_meta_used, 0, "ARC metadata used");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
+ &arc_meta_limit, 0, "ARC metadata limit");
+
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
- arc_byteswap_func_t *acb_byteswap;
arc_buf_t *acb_buf;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
@@ -368,6 +450,9 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
};
static arc_buf_t *arc_eviction_list;
@@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
/*
* Private ARC flags. These flags are private ARC only flags that will show up
@@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
+#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
+ (hdr)->b_l2hdr != NULL)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
/*
* Hash table routines
@@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table;
uint64_t zfs_crc64_table[256];
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_FEED_SECS 1 /* caching interval */
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_boost; /* warmup write boost, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ daddr_t b_daddr; /* disk address, offset byte */
+};
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
{
uintptr_t spav = (uintptr_t)spa;
uint8_t *vdva = (uint8_t *)dva;
@@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
bzero(buf, sizeof (arc_buf_hdr_t));
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
return (0);
}
@@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
+ mutex_destroy(&buf->b_freeze_lock);
+
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ rw_destroy(&buf->b_lock);
}
/*
@@ -639,7 +854,7 @@ retry:
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
@@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
+/*
+ * Recompute the fletcher-2 checksum of the buffer's data and compare it
+ * with the checksum stored in the header's b_freeze_cksum.  The header's
+ * b_freeze_lock is held across both the computation and the comparison.
+ * Returns the result of ZIO_CHECKSUM_EQUAL() (non-zero when they match).
+ * b_freeze_cksum is dereferenced without a NULL check.
+ */
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	zio_cksum_t cksum;
+	int match;
+
+	mutex_enter(&hdr->b_freeze_lock);
+	fletcher_2_native(buf->b_data, hdr->b_size, &cksum);
+	match = ZIO_CHECKSUM_EQUAL(*hdr->b_freeze_cksum, cksum);
+	mutex_exit(&hdr->b_freeze_lock);
+
+	return (match);
+}
+
static void
-arc_cksum_compute(arc_buf_t *buf)
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
mutex_enter(&buf->b_hdr->b_freeze_lock);
@@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf)
void
arc_buf_thaw(arc_buf_t *buf)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ }
- if (buf->b_hdr->b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
mutex_enter(&buf->b_hdr->b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
@@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf)
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
}
static void
@@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
uint64_t delta = ab->b_size * ab->b_datacnt;
+ list_t *list = &ab->b_state->arcs_list[ab->b_type];
+ uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
mutex_enter(&ab->b_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->arcs_list, ab);
+ list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
ASSERT(delta > 0);
- ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
- atomic_add_64(&ab->b_state->arcs_lsize, -delta);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
mutex_exit(&ab->b_state->arcs_mtx);
- /* remove the prefetch flag is we get a reference */
+ /* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
}
@@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
+ uint64_t *size = &state->arcs_lsize[ab->b_type];
+
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list, ab);
+ list_insert_head(&state->arcs_list[ab->b_type], ab);
ASSERT(ab->b_datacnt > 0);
- atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
- ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
+ atomic_add_64(size, ab->b_size * ab->b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
@@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
if (refcnt == 0) {
if (old_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ uint64_t *size = &old_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list, ab);
+ list_remove(&old_state->arcs_list[ab->b_type], ab);
/*
* If prefetching out of the ghost cache,
@@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
from_delta = ab->b_size;
}
- ASSERT3U(old_state->arcs_lsize, >=, from_delta);
- atomic_add_64(&old_state->arcs_lsize, -from_delta);
+ ASSERT3U(*size, >=, from_delta);
+ atomic_add_64(size, -from_delta);
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
}
if (new_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ uint64_t *size = &new_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
- list_insert_head(&new_state->arcs_list, ab);
+ list_insert_head(&new_state->arcs_list[ab->b_type], ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
to_delta = ab->b_size;
}
- atomic_add_64(&new_state->arcs_lsize, to_delta);
- ASSERT3U(new_state->arcs_size + to_delta, >=,
- new_state->arcs_lsize);
+ atomic_add_64(size, to_delta);
if (use_mutex)
mutex_exit(&new_state->arcs_mtx);
@@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
+ if (new_state == arc_anon) {
buf_hash_remove(ab);
}
@@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(&old_state->arcs_size, -from_delta);
}
ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+}
+
+void
+arc_space_consume(uint64_t space)
+{
+ atomic_add_64(&arc_meta_used, space);
+ atomic_add_64(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space)
+{
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ atomic_add_64(&arc_meta_used, -space);
+ ASSERT(arc_size >= space);
+ atomic_add_64(&arc_size, -space);
+}
+
+void *
+arc_data_buf_alloc(uint64_t size)
+{
+ if (arc_evict_needed(ARC_BUFC_DATA))
+ cv_signal(&arc_reclaim_thr_cv);
+ atomic_add_64(&arc_size, size);
+ return (zio_data_buf_alloc(size));
+}
+
+void
+arc_data_buf_free(void *buf, uint64_t size)
+{
+ zio_data_buf_free(buf, size);
+ ASSERT(arc_size >= size);
+ atomic_add_64(&arc_size, -size);
}
arc_buf_t *
@@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
hdr->b_spa = spa;
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
kmutex_t *hash_lock;
/*
- * Check to see if this buffer is currently being evicted via
- * arc_do_user_evicts().
+ * Check to see if this buffer is evicted. Callers
+ * must verify b_data != NULL to know if the add_ref
+ * was successful.
*/
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- mutex_exit(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_READER);
+ if (buf->b_data == NULL) {
+ rw_exit(&buf->b_lock);
return;
}
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
mutex_enter(hash_lock);
- if (buf->b_data == NULL) {
- /*
- * This buffer is evicted.
- */
- mutex_exit(hash_lock);
- return;
- }
+ rw_exit(&buf->b_lock);
- ASSERT(buf->b_hdr == hdr);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
arc_access(hdr, hash_lock);
@@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
data, metadata, hits);
}
+/*
+ * Free the arc data buffer. If it is an l2arc write in progress,
+ * the buffer is placed on l2arc_free_on_write to be freed later.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+ void *data, size_t size)
+{
+ if (HDR_L2_WRITING(hdr)) {
+ l2arc_data_free_t *df;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ free_func(data, size);
+ }
+}
+
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
@@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_cksum_verify(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
+ arc_space_return(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
+ atomic_add_64(&arc_size, -size);
}
- atomic_add_64(&arc_size, -size);
}
if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ uint64_t *cnt = &state->arcs_lsize[type];
+
ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
ASSERT(state != arc_anon);
- ASSERT3U(state->arcs_lsize, >=, size);
- atomic_add_64(&state->arcs_lsize, -size);
+
+ ASSERT3U(*cnt, >=, size);
+ atomic_add_64(cnt, -size);
}
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
@@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!(hdr->b_flags & ARC_STORED));
+
+ if (hdr->b_l2hdr != NULL) {
+ if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ if (hdr->b_l2hdr != NULL) {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
+ hdr);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+ } else {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ }
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
@@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- mutex_destroy(&hdr->b_freeze_lock);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf)
* - return the data block from this buffer rather than freeing it.
* This flag is used by callers that are trying to make space for a
* new buffer in a full arc cache.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
*/
static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type)
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
arc_buf_hdr_t *ab, *ab_prev = NULL;
+ list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
@@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
+ (spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
skipped++;
@@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
+ if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ missed += 1;
+ break;
+ }
if (buf->b_data) {
bytes_evicted += ab->b_size;
if (recycle && ab->b_type == type &&
- ab->b_size == bytes) {
+ ab->b_size == bytes &&
+ !HDR_L2_WRITING(ab)) {
stolen = buf->b_data;
recycle = FALSE;
}
@@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
} else {
+ rw_exit(&buf->b_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
}
- ASSERT(ab->b_datacnt == 0);
- arc_change_state(evicted_state, ab, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags = ARC_IN_HASH_TABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ if (ab->b_datacnt == 0) {
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags |= ARC_IN_HASH_TABLE;
+ ab->b_flags &= ~ARC_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ }
if (!have_lock)
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
@@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
if (missed)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
+ /*
+ * We have just evicted some data into the ghost state, make
+ * sure we also adjust the ghost state size if necessary.
+ */
+ if (arc_no_grow &&
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+ arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete =
+ MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+ arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
+
return (stolen);
}
@@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
@@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes)
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+ if (spa && ab->b_spa != spa)
+ continue;
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += ab->b_size;
- arc_hdr_destroy(ab);
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, ab, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ }
+
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
@@ -1256,6 +1630,12 @@ top:
}
mutex_exit(&state->arcs_mtx);
+ if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ (bytes < 0 || bytes_deleted < bytes)) {
+ list = &state->arcs_list[ARC_BUFC_METADATA];
+ goto top;
+ }
+
if (bufs_skipped) {
ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
ASSERT(bytes >= 0);
@@ -1271,38 +1651,58 @@ arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over, todelete;
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
- if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
- (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
}
mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
if (mru_over > 0) {
- if (arc_mru_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
- arc_evict_ghost(arc_mru_ghost, todelete);
+ if (arc_mru_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
}
}
if ((arc_over = arc_size - arc_c) > 0) {
int64_t tbl_over;
- if (arc_mfu->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
- (void) arc_evict(arc_mfu, toevict, FALSE,
- ARC_BUFC_UNDEF);
+ if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_DATA);
+ arc_over = arc_size - arc_c;
+ }
+
+ if (arc_over > 0 &&
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
+ arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
}
- tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
- arc_mfu_ghost->arcs_lsize - arc_c*2;
+ tbl_over = arc_size + arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c * 2;
- if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
- arc_evict_ghost(arc_mfu_ghost, todelete);
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
}
@@ -1314,7 +1714,9 @@ arc_do_user_evicts(void)
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
arc_eviction_list = buf->b_next;
+ rw_enter(&buf->b_lock, RW_WRITER);
buf->b_hdr = NULL;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
if (buf->b_efunc != NULL)
@@ -1329,24 +1731,40 @@ arc_do_user_evicts(void)
}
/*
- * Flush all *evictable* data from the cache.
+ * Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
void
-arc_flush(void)
+arc_flush(spa_t *spa)
{
- while (list_head(&arc_mru->arcs_list))
- (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
- while (list_head(&arc_mfu->arcs_list))
- (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
- arc_evict_ghost(arc_mru_ghost, -1);
- arc_evict_ghost(arc_mfu_ghost, -1);
+ arc_evict_ghost(arc_mru_ghost, spa, -1);
+ arc_evict_ghost(arc_mfu_ghost, spa, -1);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
mutex_exit(&arc_reclaim_thr_lock);
- ASSERT(arc_eviction_list == NULL);
+ ASSERT(spa || arc_eviction_list == NULL);
}
int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
@@ -1380,7 +1798,7 @@ arc_shrink(void)
arc_adjust();
}
-static int zfs_needfree = 0;
+static int needfree = 0;
static int
arc_reclaim_needed(void)
@@ -1391,13 +1809,28 @@ arc_reclaim_needed(void)
#ifdef _KERNEL
- if (zfs_needfree)
+ if (needfree)
return (1);
#if 0
/*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
* check to make sure that swapfs has enough space so that anon
- * reservations can still succeeed. anon_resvmem() checks that the
+ * reservations can still succeed. anon_resvmem() checks that the
* availrmem is greater than swapfs_minfree, and the number of reserved
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
@@ -1405,23 +1838,6 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then check that the size of available vmem for this area remains
- * above 1/4th free. This needs to be done when the size of the
- * non-default segment is smaller than physical memory, so we could
- * conceivably run out of VA in that segment before running out of
- * physical memory.
- */
- if (zio_arena != NULL) {
- size_t arc_ziosize =
- btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
-
- if ((physmem > arc_ziosize) &&
- (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
- return (1);
- }
-
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -1431,7 +1847,7 @@ arc_reclaim_needed(void)
* can have in the system. However, this is generally fixed at 25 pages
* which is so low that it's useless. In this comparison, we seek to
* calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the caclulation, if less than 1/4th is
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
if (btop(vmem_size(heap_arena, VMEM_FREE)) <
@@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
#ifdef _KERNEL
- /*
- * First purge some DNLC entries, in case the DNLC is using
- * up too much memory.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-
+ if (arc_meta_used >= arc_meta_limit) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
#if defined(__i386)
/*
* Reclaim unused memory from all kmem caches.
@@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * An agressive reclamation will shrink the cache size as well as
+ * An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
@@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused)
/* reset the growth delay for every reclaim */
growtime = LBOLT + (arc_grow_retry * hz);
- ASSERT(growtime > 0);
- if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
+ if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
/*
- * If zfs_needfree is TRUE our vm_lowmem hook
+ * If needfree is TRUE our vm_lowmem hook
* was called and in that case we must free some
* memory, so switch to aggressive mode.
*/
@@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused)
last_reclaim = ARC_RECLAIM_AGGR;
}
arc_kmem_reap_now(last_reclaim);
- } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) {
+ arc_warm = B_TRUE;
+
+ } else if (arc_no_grow && LBOLT >= growtime) {
arc_no_grow = FALSE;
}
- if (zfs_needfree ||
+ if (needfree ||
(2 * arc_c < arc_size +
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
arc_adjust();
@@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused)
arc_do_user_evicts();
if (arc_reclaim_needed()) {
- zfs_needfree = 0;
+ needfree = 0;
#ifdef _KERNEL
- wakeup(&zfs_needfree);
+ wakeup(&needfree);
#endif
}
@@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
+ if (state == arc_l2c_only)
+ return;
+
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
@@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state)
* prior to insert.
*/
static int
-arc_evict_needed()
+arc_evict_needed(arc_buf_contents_t type)
{
+ if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
+ return (1);
+
+#if 0
+#ifdef _KERNEL
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this area remains
+ * above about 1/32nd free.
+ */
+ if (type == ARC_BUFC_DATA && zio_arena != NULL &&
+ vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
+ return (1);
+#endif
+#endif
+
if (arc_reclaim_needed())
return (1);
@@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf)
* We have not yet reached cache maximum size,
* just allocate a new buffer.
*/
- if (!arc_evict_needed()) {
+ if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
goto out;
}
@@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf)
if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_p > mru_used) ? arc_mfu : arc_mru;
+ state = (arc_mfu->arcs_lsize[type] > 0 &&
+ arc_p > mru_used) ? arc_mfu : arc_mru;
} else {
/* MFU cases */
uint64_t mfu_space = arc_c - arc_p;
- state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ state = (arc_mru->arcs_lsize[type] > 0 &&
+ mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -1728,7 +2170,7 @@ out:
atomic_add_64(&hdr->b_state->arcs_size, size);
if (list_link_active(&hdr->b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize, size);
+ atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
}
/*
* If we are growing the cache, and we are adding anonymous
@@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
if (refcount_count(&buf->b_refcnt) == 0) {
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mru->arcs_mtx);
- list_remove(&arc_mru->arcs_list, buf);
- list_insert_head(&arc_mru->arcs_list, buf);
- mutex_exit(&arc_mru->arcs_mtx);
} else {
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
@@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mfu->arcs_mtx);
- list_remove(&arc_mfu->arcs_list, buf);
- list_insert_head(&arc_mfu->arcs_list, buf);
- mutex_exit(&arc_mfu->arcs_mtx);
}
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = LBOLT;
@@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ buf->b_arc_access = LBOLT;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
@@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_done_func_t */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
@@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio)
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~ARC_L2_EVICTED;
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags &= ~ARC_L2CACHE;
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
- callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+ byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+ func(buf->b_data, hdr->b_size);
+ }
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
/* create copies of the data buffer for the callers */
abuf = buf;
@@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
}
/*
@@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio)
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp. But if you know you don't need locking, you can use
+ * arc_read_nolock.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ int err;
+ arc_buf_hdr_t *hdr = pbuf->b_hdr;
+
+ ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
+ ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
+ rw_enter(&pbuf->b_lock, RW_READER);
+
+ err = arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb);
+
+ ASSERT3P(hdr, ==, pbuf->b_hdr);
+ rw_exit(&pbuf->b_lock);
+ return (err);
+}
+
+int
+arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
kmutex_t *hash_lock;
- zio_t *rzio;
+ zio_t *rzio;
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
@@ -2053,10 +2525,9 @@ top:
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, flags);
+ spa, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
acb->acb_next = hdr->b_acb;
@@ -2093,6 +2564,8 @@ top:
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -2104,6 +2577,8 @@ top:
} else {
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ daddr_t addr;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -2130,6 +2605,8 @@ top:
private);
hdr->b_flags |= ARC_PREFETCH;
}
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_INDIRECT;
} else {
@@ -2144,7 +2621,9 @@ top:
hdr->b_flags |= ARC_PREFETCH;
else
add_reference(hdr, hash_lock, private);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -2160,7 +2639,6 @@ top:
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
ASSERT(hdr->b_acb == NULL);
hdr->b_acb = acb;
@@ -2176,6 +2654,18 @@ top:
if (GHOST_STATE(hdr->b_state))
arc_access(hdr, hash_lock);
+
+ if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
+ (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ addr = hdr->b_l2hdr->b_daddr;
+ /*
+ * Lock out device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
@@ -2186,8 +2676,65 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
+ if (vd != NULL) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ */
+ if (hdr->b_l2hdr != NULL &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority, zio_flags |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+
+ if (*arc_flags & ARC_NOWAIT) {
+ zio_nowait(rzio);
+ return (0);
+ }
+
+ ASSERT(*arc_flags & ARC_WAIT);
+ if (zio_wait(rzio) == 0)
+ return (0);
+
+ /* l2arc read error; goto zio_read() */
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ }
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags, zb);
+ arc_read_done, buf, priority, zio_flags, zb);
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_buf_t **bufp;
- mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
return (0);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
-
- if (buf->b_data == NULL) {
+ } else if (buf->b_data == NULL) {
+ arc_buf_t copy = *buf; /* structure assignment */
/*
- * We are on the eviction list.
+ * We are on the eviction list; process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
*/
- mutex_exit(hash_lock);
- mutex_enter(&arc_eviction_mtx);
- if (buf->b_hdr == NULL) {
- /*
- * We are already in arc_do_user_evicts().
- */
- mutex_exit(&arc_eviction_mtx);
- return (0);
- } else {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * Process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
+ buf->b_efunc = NULL;
+ rw_exit(&buf->b_lock);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
}
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
@@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf)
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&old_state->arcs_mtx);
}
mutex_exit(hash_lock);
+ rw_exit(&buf->b_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
@@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf)
* Release this buffer from the cache. This must be done
* after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
+ * a new hdr for the buffer.
*/
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_size;
+
+ rw_enter(&buf->b_lock, RW_WRITER);
+ hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
@@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf);
+ rw_exit(&buf->b_lock);
return;
}
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ l2hdr = hdr->b_l2hdr;
+ if (l2hdr) {
+ mutex_enter(&l2arc_buflist_mtx);
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+
/*
* Do we have more than one buf?
*/
- if (hdr->b_buf != buf || buf->b_next != NULL) {
+ if (hdr->b_datacnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
- ASSERT(hdr->b_datacnt > 1);
+ ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
* Pull the data off of this buf and attach it to
* a new anonymous buf.
@@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
if (refcount_is_zero(&hdr->b_refcnt)) {
- ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
+ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3U(*size, >=, hdr->b_size);
+ atomic_add_64(size, -hdr->b_size);
}
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
nhdr->b_type = type;
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
nhdr->b_freeze_cksum = NULL;
- mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
+ rw_exit(&buf->b_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
} else {
+ rw_exit(&buf->b_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
mutex_exit(hash_lock);
+
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
@@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag)
}
buf->b_efunc = NULL;
buf->b_private = NULL;
+
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ mutex_exit(&l2arc_buflist_mtx);
+ }
}
int
arc_released(arc_buf_t *buf)
{
- return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ int released;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ rw_exit(&buf->b_lock);
+ return (released);
}
int
arc_has_callback(arc_buf_t *buf)
{
- return (buf->b_efunc != NULL);
+ int callback;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ callback = (buf->b_efunc != NULL);
+ rw_exit(&buf->b_lock);
+ return (callback);
}
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
- return (refcount_count(&buf->b_hdr->b_refcnt));
+ int referenced;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ rw_exit(&buf->b_lock);
+ return (referenced);
}
#endif
@@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- if (callback->awcb_ready) {
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum.
+ * It is the responsibility of the callback to handle the
+ * accounting for any re-write attempt.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
}
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
static void
@@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio)
hdr->b_acb = NULL;
- /* this buffer is on no lists and is not in the hash table */
- ASSERT3P(hdr->b_state, ==, arc_anon);
-
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
@@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio)
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
BP_IDENTITY(zio->io_bp)));
ASSERT3U(zio->io_bp_orig.blk_birth, ==,
@@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio)
ASSERT3P(exists, ==, NULL);
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- arc_access(hdr, hash_lock);
+ /* if it's not anon, we are doing a scrub */
+ if (hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else if (callback->awcb_done == NULL) {
int destroy_hdr;
@@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio)
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
+ hdr->b_flags &= ~ARC_STORED;
if (callback->awcb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
@@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio)
kmem_free(callback, sizeof (arc_write_callback_t));
}
+/*
+ * Fill in the zio properties (zp) for a write from the caller's write
+ * properties (wp): checksum, compression, object type, level and the
+ * number of DVAs to allocate.
+ */
+static void
+write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
+{
+	boolean_t is_meta;
+
+	/* Blocks above level 0 and metadata object types are metadata. */
+	is_meta = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
+
+	if (!is_meta) {
+		/*
+		 * Not metadata: the selectors pick the checksum and
+		 * compression from the wp_dn* and wp_os* settings.
+		 */
+		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
+		    wp->wp_oschecksum);
+		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
+		    wp->wp_oscompress);
+	} else {
+		/*
+		 * Metadata always gets checksummed.  If the data
+		 * checksum is multi-bit correctable, and it's not a
+		 * ZBT-style checksum, then it's suitable for metadata
+		 * as well.  Otherwise, the metadata checksum defaults
+		 * to fletcher4.
+		 */
+		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
+		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
+			zp->zp_checksum = wp->wp_oschecksum;
+		else
+			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		zp->zp_compress = zfs_mdcomp_disable ?
+		    ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB;
+	}
+
+	zp->zp_type = wp->wp_type;
+	zp->zp_level = wp->wp_level;
+	/* Metadata gets one extra copy, capped at spa_max_replication(). */
+	zp->zp_ndvas = MIN(wp->wp_copies + is_meta, spa_max_replication(spa));
+}
+
zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
+ boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
- zio_t *zio;
+ zio_t *zio;
+ zio_prop_t zp;
- /* this is a private buffer - no locking required */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(ready != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
ASSERT(hdr->b_acb == 0);
+ if (l2arc)
+ hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
- zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
- priority, flags, zb);
+
+ write_policy(spa, wp, &zp);
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
@@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
* nonzero, it should match what we have in the cache.
*/
ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
+ bp->blk_fill == BLK_FILL_ALREADY_FREED);
+
if (ab->b_state != arc_anon)
arc_change_state(arc_anon, ab, hash_lock);
if (HDR_IO_IN_PROGRESS(ab)) {
@@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
} else if (refcount_is_zero(&ab->b_refcnt)) {
+ ab->b_flags |= ARC_FREE_IN_PROGRESS;
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
ARCSTAT_BUMP(arcstat_deleted);
@@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
}
- zio = zio_free(pio, spa, txg, bp, done, private);
+ zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
if (arc_flags & ARC_WAIT)
return (zio_wait(zio));
@@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
return (0);
}
+/*
+ * Decide whether a write reserving 'reserve' bytes in transaction group
+ * 'txg' should be throttled because free memory is short.  Returns 0 to
+ * let the write proceed, or ERESTART/EAGAIN to throttle it.  Outside the
+ * kernel (_KERNEL undefined) this always returns 0.
+ */
+static int
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+	/* Pageout load accumulated so far; reset when a newer txg is seen. */
+	static uint64_t page_load = 0;
+	static uint64_t last_txg = 0;
+	uint64_t anon_bytes = arc_anon->arcs_size;
+	uint64_t free_bytes = ptoa((uintmax_t)cnt.v_free_count);
+
+#if 0
+#if defined(__i386)
+	free_bytes =
+	    MIN(free_bytes, vmem_size(heap_arena, VMEM_FREE));
+#endif
+#endif
+	/* Plenty of free memory: nothing to throttle. */
+	if (free_bytes >= zfs_write_limit_max)
+		return (0);
+
+	if (txg > last_txg) {
+		last_txg = txg;
+		page_load = 0;
+	}
+
+	/*
+	 * If we are in pageout, we know that memory is already tight,
+	 * the arc is already going to be evicting, so we just want to
+	 * continue to let page writes occur as quickly as possible.
+	 */
+	if (curproc == pageproc) {
+		if (page_load > free_bytes / 4)
+			return (ERESTART);
+		/* Note: reserve is inflated, so we deflate */
+		page_load += reserve / 8;
+		return (0);
+	}
+
+	if (page_load > 0 && arc_reclaim_needed()) {
+		/* memory is low, delay before restarting */
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (EAGAIN);
+	}
+	page_load = 0;
+
+	/* Count what the MRU and MFU lists could give back as available. */
+	if (arc_size > arc_c_min) {
+		uint64_t evictable =
+		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+		free_bytes += MIN(evictable, arc_size - arc_c_min);
+	}
+
+	/* Throttle once anonymous data exceeds a quarter of that total. */
+	if (anon_bytes > free_bytes / 4) {
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (ERESTART);
+	}
+#endif
+	return (0);
+}
+
void
-arc_tempreserve_clear(uint64_t tempreserve)
+arc_tempreserve_clear(uint64_t reserve)
{
- atomic_add_64(&arc_tempreserve, -tempreserve);
+ atomic_add_64(&arc_tempreserve, -reserve);
ASSERT((int64_t)arc_tempreserve >= 0);
}
int
-arc_tempreserve_space(uint64_t tempreserve)
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
+ int error;
+
#ifdef ZFS_DEBUG
/*
* Once in a while, fail for no reason. Everything should cope.
@@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve)
return (ERESTART);
}
#endif
- if (tempreserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, tempreserve * 4);
- if (tempreserve > arc_c)
+ if (reserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, reserve * 4);
+ if (reserve > arc_c)
return (ENOMEM);
/*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ if (error = arc_memory_throttle(reserve, txg))
+ return (error);
+
+ /*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
- *
- * XXX The limit should be adjusted dynamically to keep the time
- * to sync a dataset fixed (around 1-5 seconds?).
*/
-
- if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
- dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
- "tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
- tempreserve>>10, arc_c>>10);
+ if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+ arc_anon->arcs_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+ reserve>>10, arc_c>>10);
return (ERESTART);
}
- atomic_add_64(&arc_tempreserve, tempreserve);
+ atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
@@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused)
/* Serialize access via arc_lowmem_lock. */
mutex_enter(&arc_lowmem_lock);
- zfs_needfree = 1;
+ needfree = 1;
cv_signal(&arc_reclaim_thr_cv);
- while (zfs_needfree)
- tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
+ while (needfree)
+ tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
mutex_exit(&arc_lowmem_lock);
}
#endif
@@ -2743,6 +3442,16 @@ arc_init(void)
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -2757,6 +3466,7 @@ arc_init(void)
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -2764,15 +3474,28 @@ arc_init(void)
mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
@@ -2798,6 +3521,13 @@ arc_init(void)
#endif
arc_dead = FALSE;
+ arc_warm = B_FALSE;
+
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
#ifdef _KERNEL
/* Warn about ZFS memory and address space requirements. */
@@ -2808,9 +3538,9 @@ arc_init(void)
if (kmem_size() < 512 * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
"expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
+ printf(" Consider tuning vm.kmem_size and "
"vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
+ printf(" in /boot/loader.conf.\n");
}
#endif
}
@@ -2818,6 +3548,7 @@ arc_init(void)
void
arc_fini(void)
{
+
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
cv_signal(&arc_reclaim_thr_cv);
@@ -2825,7 +3556,7 @@ arc_fini(void)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
mutex_exit(&arc_reclaim_thr_lock);
- arc_flush();
+ arc_flush(NULL);
arc_dead = TRUE;
@@ -2838,10 +3569,14 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc_mru->arcs_list);
- list_destroy(&arc_mru_ghost->arcs_list);
- list_destroy(&arc_mfu->arcs_list);
- list_destroy(&arc_mfu_ghost->arcs_list);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
mutex_destroy(&arc_anon->arcs_mtx);
mutex_destroy(&arc_mru->arcs_mtx);
@@ -2849,6 +3584,8 @@ arc_fini(void)
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ mutex_destroy(&zfs_write_limit_lock);
+
buf_fini();
mutex_destroy(&arc_lowmem_lock);
@@ -2857,3 +3594,985 @@ arc_fini(void)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ */
+
+/*
+ * Header-size accounting: take HDR_SIZE out of arcstat_hdr_size and add
+ * HDR_SIZE plus L2HDR_SIZE to arcstat_l2_hdr_size.
+ */
+static void
+l2arc_hdr_stat_add(void)
+{
+	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+}
+
+/*
+ * Header-size accounting: add HDR_SIZE back to arcstat_hdr_size and take
+ * HDR_SIZE plus L2HDR_SIZE out of arcstat_l2_hdr_size.
+ */
+static void
+l2arc_hdr_stat_remove(void)
+{
+	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+}
+
+/*
+ * Cycle through L2ARC devices.  This is how L2ARC load balances.
+ * The search starts after l2arc_dev_last and wraps around the device list.
+ * Returns NULL when there are no devices or no device with a live vdev was
+ * found; otherwise returns the device with the SCL_L2ARC spa config lock
+ * of its spa held as reader.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+	l2arc_dev_t *start = NULL;
+	l2arc_dev_t *dev = NULL;
+
+	/*
+	 * Lock out the removal of spas (spa_namespace_lock), then removal
+	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
+	 * both locks will be dropped and a spa config lock held instead.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&l2arc_dev_mtx);
+
+	/* if there are no vdevs, there is nothing to do */
+	if (l2arc_ndev != 0) {
+		dev = l2arc_dev_last;
+		for (;;) {
+			/* step to the next device, wrapping at the end */
+			if (dev != NULL)
+				dev = list_next(l2arc_dev_list, dev);
+			if (dev == NULL)
+				dev = list_head(l2arc_dev_list);
+
+			/* if we have come back to the start, bail out */
+			if (start == NULL)
+				start = dev;
+			else if (dev == start)
+				break;
+
+			/* stop at the first non-faulted vdev */
+			if (!vdev_is_dead(dev->l2ad_vdev))
+				break;
+		}
+
+		/* if we were unable to find any usable vdevs, return NULL */
+		if (vdev_is_dead(dev->l2ad_vdev))
+			dev = NULL;
+
+		l2arc_dev_last = dev;
+	}
+
+	mutex_exit(&l2arc_dev_mtx);
+
+	/*
+	 * Grab the config lock to prevent the selected device from being
+	 * removed while we are writing to it.
+	 */
+	if (dev != NULL)
+		spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER);
+	mutex_exit(&spa_namespace_lock);
+
+	return (dev);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ *
+ * Walks the l2arc_free_on_write list from tail to head while holding
+ * l2arc_free_on_write_mtx.  For each entry it calls the recorded free
+ * function on the recorded data and size, unlinks the entry and frees it.
+ */
+static void
+l2arc_do_free_on_write(void)
+{
+	list_t *buflist;
+	l2arc_data_free_t *df, *df_prev;
+
+	mutex_enter(&l2arc_free_on_write_mtx);
+	buflist = l2arc_free_on_write;
+
+	for (df = list_tail(buflist); df; df = df_prev) {
+		/* fetch the predecessor first; df is freed below */
+		df_prev = list_prev(buflist, df);
+		ASSERT(df->l2df_data != NULL);
+		ASSERT(df->l2df_func != NULL);
+		df->l2df_func(df->l2df_data, df->l2df_size);
+		list_remove(buflist, df);
+		kmem_free(df, sizeof (l2arc_data_free_t));
+	}
+
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *ab, *ab_prev;
+ l2arc_buf_hdr_t *abl2;
+ kmutex_t *hash_lock;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT(dev != NULL);
+ head = cb->l2wcb_head;
+ ASSERT(head != NULL);
+ buflist = dev->l2ad_buflist;
+ ASSERT(buflist != NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+ for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * This buffer misses out. It may be in a stage
+ * of eviction. Its ARC_L2_WRITING flag will be
+ * left set, denying reads to this buffer.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+ continue;
+ }
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, ab);
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+
+ /*
+ * Allow ARC to begin reads to this L2ARC entry.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));</