-rw-r--r--  cddl/compat/opensolaris/include/fcntl.h  3
-rw-r--r--  cddl/compat/opensolaris/include/mnttab.h  4
-rw-r--r--  cddl/compat/opensolaris/include/priv.h  2
-rw-r--r--  cddl/compat/opensolaris/include/sha2.h  38
-rw-r--r--  cddl/compat/opensolaris/include/solaris.h  6
-rw-r--r--  cddl/compat/opensolaris/include/thread_pool.h  39
-rw-r--r--  cddl/compat/opensolaris/misc/fsshare.c  14
-rw-r--r--  cddl/compat/opensolaris/misc/zmount.c  5
-rw-r--r--  cddl/contrib/opensolaris/cmd/stat/common/statcommon.h  50
-rw-r--r--  cddl/contrib/opensolaris/cmd/stat/common/timestamp.c  49
-rw-r--r--  cddl/contrib/opensolaris/cmd/zdb/zdb.c  2241
-rw-r--r--  cddl/contrib/opensolaris/cmd/zdb/zdb_il.c  131
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs.8  580
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c  29
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h  1
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs_main.c  3667
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs_util.h  6
-rw-r--r--  cddl/contrib/opensolaris/cmd/zinject/translate.c  55
-rw-r--r--  cddl/contrib/opensolaris/cmd/zinject/zinject.c  255
-rw-r--r--  cddl/contrib/opensolaris/cmd/zinject/zinject.h  9
-rw-r--r--  cddl/contrib/opensolaris/cmd/zlook/zlook.c  411
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool.8  684
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool_main.c  977
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool_util.c  20
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool_util.h  8
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c  471
-rw-r--r--  cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1  67
-rw-r--r--  cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c  429
-rw-r--r--  cddl/contrib/opensolaris/cmd/ztest/ztest.c  4851
-rw-r--r--  cddl/contrib/opensolaris/head/synch.h  27
-rw-r--r--  cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c  783
-rw-r--r--  cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h  160
-rw-r--r--  cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h  13
-rw-r--r--  cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c  41
-rw-r--r--  cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c  33
-rw-r--r--  cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c  56
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h  211
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c  94
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c  22
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c  1503
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c  832
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c  452
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h  53
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c  598
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c  354
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c  1533
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c  1346
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c  99
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c  185
-rw-r--r--  cddl/contrib/opensolaris/lib/libzpool/common/kernel.c  128
-rw-r--r--  cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h  109
-rw-r--r--  cddl/contrib/opensolaris/lib/libzpool/common/taskq.c  49
-rw-r--r--  cddl/contrib/opensolaris/lib/libzpool/common/util.c  5
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/__init__.py  5
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/allow.py  16
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/dataset.py  37
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/groupspace.py  5
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/holds.py  75
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/ioctl.c  117
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/table.py  70
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/unallow.py  5
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/userspace.py  77
-rw-r--r--  cddl/contrib/opensolaris/lib/pyzfs/common/util.py  13
-rw-r--r--  cddl/lib/libzfs/Makefile  29
-rw-r--r--  cddl/lib/libzpool/Makefile  18
-rw-r--r--  cddl/sbin/zfs/Makefile  8
-rw-r--r--  cddl/sbin/zpool/Makefile  15
-rw-r--r--  cddl/usr.bin/Makefile  4
-rw-r--r--  cddl/usr.bin/zlook/Makefile  25
-rw-r--r--  cddl/usr.bin/zstreamdump/Makefile  27
-rw-r--r--  cddl/usr.bin/ztest/Makefile  1
-rw-r--r--  cddl/usr.sbin/zdb/Makefile  1
-rw-r--r--  rescue/rescue/Makefile  2
-rw-r--r--  sys/boot/i386/gptzfsboot/Makefile  1
-rw-r--r--  sys/boot/i386/zfsboot/Makefile  2
-rw-r--r--  sys/boot/zfs/zfs.c  40
-rw-r--r--  sys/boot/zfs/zfsimpl.c  211
-rw-r--r--  sys/cddl/boot/zfs/fletcher.c  34
-rw-r--r--  sys/cddl/boot/zfs/zfsimpl.h  246
-rw-r--r--  sys/cddl/boot/zfs/zfssubr.c  1683
-rw-r--r--  sys/cddl/boot/zfs/zle.c  54
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c  24
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c  2
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_misc.c  38
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_policy.c  197
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_string.c  33
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c  198
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c  334
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c  20
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c  72
-rw-r--r--  sys/cddl/compat/opensolaris/kern/opensolaris_zone.c  9
-rw-r--r--  sys/cddl/compat/opensolaris/sys/atomic.h  18
-rw-r--r--  sys/cddl/compat/opensolaris/sys/byteorder.h  22
-rw-r--r--  sys/cddl/compat/opensolaris/sys/dirent.h  5
-rw-r--r--  sys/cddl/compat/opensolaris/sys/file.h  16
-rw-r--r--  sys/cddl/compat/opensolaris/sys/kmem.h  8
-rw-r--r--  sys/cddl/compat/opensolaris/sys/misc.h  11
-rw-r--r--  sys/cddl/compat/opensolaris/sys/mount.h  2
-rw-r--r--  sys/cddl/compat/opensolaris/sys/mutex.h  2
-rw-r--r--  sys/cddl/compat/opensolaris/sys/policy.h  58
-rw-r--r--  sys/cddl/compat/opensolaris/sys/proc.h  2
-rw-r--r--  sys/cddl/compat/opensolaris/sys/rwlock.h  2
-rw-r--r--  sys/cddl/compat/opensolaris/sys/sid.h  21
-rw-r--r--  sys/cddl/compat/opensolaris/sys/stat.h  20
-rw-r--r--  sys/cddl/compat/opensolaris/sys/string.h  4
-rw-r--r--  sys/cddl/compat/opensolaris/sys/sunddi.h  29
-rw-r--r--  sys/cddl/compat/opensolaris/sys/sysmacros.h  143
-rw-r--r--  sys/cddl/compat/opensolaris/sys/systeminfo.h  6
-rw-r--r--  sys/cddl/compat/opensolaris/sys/systm.h  3
-rw-r--r--  sys/cddl/compat/opensolaris/sys/taskq.h  2
-rw-r--r--  sys/cddl/compat/opensolaris/sys/time.h  13
-rw-r--r--  sys/cddl/compat/opensolaris/sys/types.h  3
-rw-r--r--  sys/cddl/compat/opensolaris/sys/uio.h  23
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vfs.h  12
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vnode.h  18
-rw-r--r--  sys/cddl/compat/opensolaris/sys/zone.h  7
-rw-r--r--  sys/cddl/contrib/opensolaris/common/acl/acl_common.c  361
-rw-r--r--  sys/cddl/contrib/opensolaris/common/acl/acl_common.h  12
-rw-r--r--  sys/cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S  20
-rw-r--r--  sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S  6
-rw-r--r--  sys/cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S  11
-rw-r--r--  sys/cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S  13
-rw-r--r--  sys/cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S  2
-rw-r--r--  sys/cddl/contrib/opensolaris/common/avl/avl.c  7
-rw-r--r--  sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c  60
-rw-r--r--  sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c  1
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c  144
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h  12
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c  9
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h  8
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c (renamed from sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c)  1
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h  53
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c  349
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h  223
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c  233
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h  18
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c  62
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c  51
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/Makefile.files  22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c  153
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  733
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c  316
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c  495
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  962
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c  1152
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c  156
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c  911
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c  245
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c  1196
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  1030
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  244
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  373
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c  683
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  87
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  2080
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c  474
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c  281
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  376
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c  866
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c  1766
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c  1060
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c  61
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c  37
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c  344
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c  40
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c  1970
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c  123
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c  2904
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c  111
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c  45
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c  150
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c  454
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c  11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h  60
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h  91
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h  246
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  208
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h  40
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h  100
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h  21
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h  15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h  76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  114
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h  87
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h  7
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h  18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h  62
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h  52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h  108
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h  8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h  10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h  17
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h  171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h  287
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h  265
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  83
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h  11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h  10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h  11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h  37
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h  39
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h  72
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h  35
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h  38
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h  15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h  172
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h  66
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h  142
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h  55
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h  24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h  116
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  134
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h  63
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  330
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  26
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h  16
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h  182
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h  66
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c  175
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  990
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c  119
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c  45
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c  171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c  174
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c  35
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c  21
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c  137
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c  1546
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c  20
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c  244
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c  171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c  539
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c  1290
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  234
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c  95
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c  247
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c  691
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c  64
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  2916
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  178
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c  252
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c  147
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c  8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c  334
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  863
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  2628
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c  1065
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c  1583
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c  993
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c  140
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c  98
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  181
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c  86
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c  194
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c  1967
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/os/callb.c  76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/os/fm.c  1402
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/acl.h  4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/avl.h  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h  170
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/callb.h  12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h  27
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h  112
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cred.h  13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/debug.h  23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h  13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h  57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h  375
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h  93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h  10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h  10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/processor.h  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h  132
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h  256
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h  52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h  42
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h  8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h  24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h  30
-rw-r--r--  sys/modules/opensolaris/Makefile  3
-rw-r--r--  sys/modules/zfs/Makefile  12
-rw-r--r--  usr.bin/fstat/zfs.c  1
304 files changed, 59466 insertions, 19882 deletions
diff --git a/cddl/compat/opensolaris/include/fcntl.h b/cddl/compat/opensolaris/include/fcntl.h
index 9b6c3f9ee62d..548918aaab3a 100644
--- a/cddl/compat/opensolaris/include/fcntl.h
+++ b/cddl/compat/opensolaris/include/fcntl.h
@@ -32,6 +32,7 @@
#include_next <fcntl.h>
-#define open64 open
+#define open64(...) open(__VA_ARGS__)
+#define openat64(...) openat(__VA_ARGS__)
#endif
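
The move from the object-like "#define open64 open" to function-like variadic macros is deliberate: a function-like macro expands only where the name is followed by a parenthesized argument list, so unrelated uses of the identifier are left untouched, and one definition forwards both the two- and three-argument forms of open(2). A minimal sketch of the pattern (path, flags, and mode below are illustrative):

    /* Hedged illustration of the shim above, not part of the change itself. */
    #include <fcntl.h>

    #define open64(...) open(__VA_ARGS__)

    int
    demo(void)
    {
            /* Expands to open("/tmp/f", O_RDWR | O_CREAT, 0644). */
            return (open64("/tmp/f", O_RDWR | O_CREAT, 0644));
    }
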
diff --git a/cddl/compat/opensolaris/include/mnttab.h b/cddl/compat/opensolaris/include/mnttab.h
index a18dd8d1893b..227196a4017f 100644
--- a/cddl/compat/opensolaris/include/mnttab.h
+++ b/cddl/compat/opensolaris/include/mnttab.h
@@ -12,6 +12,10 @@
#define MNTTAB _PATH_DEVZERO
#define MNT_LINE_MAX 1024
+#define MS_OVERLAY 0x0
+#define MS_NOMNTTAB 0x0
+#define MS_RDONLY 0x1
+
#define umount2(p, f) unmount(p, f)
struct mnttab {
diff --git a/cddl/compat/opensolaris/include/priv.h b/cddl/compat/opensolaris/include/priv.h
index 32696ae5668b..2fee5b0d40c8 100644
--- a/cddl/compat/opensolaris/include/priv.h
+++ b/cddl/compat/opensolaris/include/priv.h
@@ -10,7 +10,7 @@
#define PRIV_SYS_CONFIG 0
static __inline int
-priv_ineffect(priv)
+priv_ineffect(int priv)
{
assert(priv == PRIV_SYS_CONFIG);
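
The priv.h hunk is a plain C cleanup rather than a behavior change: in the old K&R-style definition priv_ineffect(priv) the parameter has no declared type and falls back to implicit int, which C99 dropped and which modern compilers reject or warn about; priv_ineffect(int priv) is the equivalent ANSI prototype-style definition. Side by side (sketch only):

    /* Pre-ANSI form removed by the patch: 'priv' implicitly typed int. */
    static int
    ineffect_old(priv)
    {
            return (priv == 0);
    }

    /* ANSI form the patch installs: explicit parameter type. */
    static int
    ineffect_new(int priv)
    {
            return (priv == 0);
    }
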
diff --git a/cddl/compat/opensolaris/include/sha2.h b/cddl/compat/opensolaris/include/sha2.h
new file mode 100644
index 000000000000..488f2dbd8b47
--- /dev/null
+++ b/cddl/compat/opensolaris/include/sha2.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SHA2_H_
+#define _OPENSOLARIS_SHA2_H_
+
+#include_next <sha256.h>
+
+#define SHA256Init(c) SHA256_Init(c)
+#define SHA256Update(c, d, s) SHA256_Update((c), (d), (s))
+#define SHA256Final(b, c) SHA256_Final((unsigned char *)(b), (c))
+
+#endif /* !_OPENSOLARIS_SHA2_H_ */
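
The new sha2.h bridges the Solaris SHA-2 API onto FreeBSD's libmd: SHA256Init/SHA256Update/SHA256Final map to the SHA256_Init/SHA256_Update/SHA256_Final routines declared in <sha256.h>, with SHA256Final casting its buffer so callers may pass a plain char array. A minimal sketch of code exercising the shim (compile with -lmd and the compat include path in effect; the message is illustrative):

    #include <sha2.h>       /* the compat header added above */
    #include <stdio.h>

    int
    main(void)
    {
            SHA256_CTX ctx;
            char digest[32];

            SHA256Init(&ctx);                /* -> SHA256_Init(&ctx) */
            SHA256Update(&ctx, "abc", 3);    /* -> SHA256_Update(&ctx, "abc", 3) */
            SHA256Final(digest, &ctx);       /* -> SHA256_Final((unsigned char *)digest, &ctx) */

            for (int i = 0; i < 32; i++)
                    (void) printf("%02x", (unsigned char)digest[i]);
            (void) printf("\n");
            return (0);
    }
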
diff --git a/cddl/compat/opensolaris/include/solaris.h b/cddl/compat/opensolaris/include/solaris.h
index 01f9d4776abc..9bead018bcec 100644
--- a/cddl/compat/opensolaris/include/solaris.h
+++ b/cddl/compat/opensolaris/include/solaris.h
@@ -5,6 +5,10 @@
#include <sys/ccompile.h>
-#define dirent64 dirent
+#include <fcntl.h>
+
+#define NOTE(s)
+
+int mkdirp(const char *, mode_t);
#endif /* !_SOLARIS_H_ */
diff --git a/cddl/compat/opensolaris/include/thread_pool.h b/cddl/compat/opensolaris/include/thread_pool.h
new file mode 100644
index 000000000000..25ac55dedea7
--- /dev/null
+++ b/cddl/compat/opensolaris/include/thread_pool.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_THREAD_POOL_H_
+#define _OPENSOLARIS_THREAD_POOL_H_
+
+typedef int tpool_t;
+
+#define tpool_create(a, b, c, d) (0)
+#define tpool_dispatch(pool, func, arg) func(arg)
+#define tpool_wait(pool) do { } while (0)
+#define tpool_destroy(pool) do { } while (0)
+
+#endif /* !_OPENSOLARIS_THREAD_POOL_H_ */
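
These thread_pool.h stubs let consumers written against Solaris libtpool compile and run without any actual pool: tpool_create() evaluates to the dummy handle 0, tpool_dispatch() runs the work function inline on the caller's thread, and tpool_wait()/tpool_destroy() are no-ops, which is sound precisely because every dispatch has already completed by the time it returns. A behavioral sketch (do_work and n are hypothetical names):

    #include <thread_pool.h>        /* the compat header added above */

    static void
    do_work(void *arg)
    {
            (*(int *)arg)++;        /* runs synchronously, not in a pool thread */
    }

    void
    demo(void)
    {
            int n = 0;
            tpool_t tp = tpool_create(1, 4, 60, NULL);      /* expands to (0) */

            tpool_dispatch(tp, do_work, &n);        /* expands to do_work(&n) */
            tpool_wait(tp);                         /* no-op */
            tpool_destroy(tp);                      /* no-op */
            /* n == 1 here: the "dispatch" already completed. */
    }
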
diff --git a/cddl/compat/opensolaris/misc/fsshare.c b/cddl/compat/opensolaris/misc/fsshare.c
index 10ed591d5bca..e8faa928d6fb 100644
--- a/cddl/compat/opensolaris/misc/fsshare.c
+++ b/cddl/compat/opensolaris/misc/fsshare.c
@@ -28,15 +28,17 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <string.h>
+
+#include <assert.h>
#include <errno.h>
+#include <fcntl.h>
+#include <fsshare.h>
#include <libutil.h>
-#include <assert.h>
#include <pathnames.h> /* _PATH_MOUNTDPID */
-#include <fsshare.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
#define FILE_HEADER "# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n"
#define OPTSSIZE 1024
diff --git a/cddl/compat/opensolaris/misc/zmount.c b/cddl/compat/opensolaris/misc/zmount.c
index 493a4fc4ef12..b4f99e3be9fd 100644
--- a/cddl/compat/opensolaris/misc/zmount.c
+++ b/cddl/compat/opensolaris/misc/zmount.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <mnttab.h>
static void
build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val,
@@ -78,7 +79,7 @@ zmount(const char *spec, const char *dir, int mflag, char *fstype,
assert(spec != NULL);
assert(dir != NULL);
- assert(mflag == 0);
+ assert(mflag == 0 || mflag == MS_RDONLY);
assert(fstype != NULL);
assert(strcmp(fstype, MNTTYPE_ZFS) == 0);
assert(dataptr == NULL);
@@ -91,6 +92,8 @@ zmount(const char *spec, const char *dir, int mflag, char *fstype,
iov = NULL;
iovlen = 0;
+ if (mflag & MS_RDONLY)
+ build_iovec(&iov, &iovlen, "ro", NULL, 0);
build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1);
build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir),
(size_t)-1);
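
zmount() converts Solaris-style mount arguments into the name/value iovec list that FreeBSD's nmount(2) consumes; the hunk above adds the read-only case by emitting a boolean "ro" option (NULL value, zero length) when MS_RDONLY is set. A condensed sketch of the construction using the file's own build_iovec() helper (the mount source and target below are illustrative, and the real function passes additional options):

    struct iovec *iov = NULL;
    int iovlen = 0;
    char fstype[] = "zfs", fspath[] = "/tank/fs", from[] = "tank/fs";

    if (mflag & MS_RDONLY)
            build_iovec(&iov, &iovlen, "ro", NULL, 0);
    build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1);
    build_iovec(&iov, &iovlen, "fspath", fspath, (size_t)-1);
    build_iovec(&iov, &iovlen, "from", from, (size_t)-1);
    (void) nmount(iov, iovlen, 0);
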
diff --git a/cddl/contrib/opensolaris/cmd/stat/common/statcommon.h b/cddl/contrib/opensolaris/cmd/stat/common/statcommon.h
new file mode 100644
index 000000000000..f82495f22b5f
--- /dev/null
+++ b/cddl/contrib/opensolaris/cmd/stat/common/statcommon.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Common routines for acquiring snapshots of kstats for
+ * iostat, mpstat, and vmstat.
+ */
+
+#ifndef _STATCOMMON_H
+#define _STATCOMMON_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <time.h>
+
+#define NODATE 0 /* Default: No time stamp */
+#define DDATE 1 /* Standard date format */
+#define UDATE 2 /* Internal representation of Unix time */
+
+/* Print a timestamp in either Unix or standard format. */
+void print_timestamp(uint_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _STATCOMMON_H */
diff --git a/cddl/contrib/opensolaris/cmd/stat/common/timestamp.c b/cddl/contrib/opensolaris/cmd/stat/common/timestamp.c
new file mode 100644
index 000000000000..be7b30c29fd0
--- /dev/null
+++ b/cddl/contrib/opensolaris/cmd/stat/common/timestamp.c
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "statcommon.h"
+
+#include <langinfo.h>
+
+/*
+ * Print timestamp as decimal representation of time_t value (-T u was specified)
+ * or in date(1) format (-T d was specified).
+ */
+void
+print_timestamp(uint_t timestamp_fmt)
+{
+ time_t t = time(NULL);
+
+ if (timestamp_fmt == UDATE) {
+ (void) printf("%ld\n", t);
+ } else if (timestamp_fmt == DDATE) {
+ char dstr[64];
+ int len;
+
+ len = strftime(dstr, sizeof (dstr), "%+", localtime(&t));
+ if (len > 0)
+ (void) printf("%s\n", dstr);
+ }
+}
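
print_timestamp() implements the -T flag of the stat utilities: UDATE prints the raw decimal time_t, DDATE formats via strftime's "%+" conversion (a BSD/glibc extension that mirrors default date(1) output), and NODATE prints nothing. A usage sketch (the outputs shown are illustrative):

    #include "statcommon.h"

    /* Called once per sampling interval, e.g. for iostat -T d or -T u. */
    static void
    stamp_demo(void)
    {
            print_timestamp(UDATE);         /* e.g. "1294081509" */
            print_timestamp(DDATE);         /* e.g. "Mon Jan  3 14:05:09 EST 2011" */
            print_timestamp(NODATE);        /* prints nothing */
    }
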
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index 915ea1917a0e..c6e219df9e1d 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <stdio.h>
@@ -34,6 +33,9 @@
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
@@ -51,10 +53,25 @@
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
+#include <sys/ddt.h>
#undef ZFS_MAXNAMELEN
#undef verify
#include <libzfs.h>
+#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
+ zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+ zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
+ dmu_ot[(idx)].ot_name : "UNKNOWN")
+#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+
+#ifndef lint
+extern int zfs_recover;
+#else
+int zfs_recover;
+#endif
+
const char cmdname[] = "zdb";
uint8_t dump_opt[256];
@@ -64,8 +81,6 @@ extern void dump_intent_log(zilog_t *);
uint64_t *zopt_object = NULL;
int zopt_objects = 0;
libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -87,39 +102,56 @@ static void
usage(void)
{
(void) fprintf(stderr,
- "Usage: %s [-udibcsvL] [-U cachefile_path] [-t txg]\n"
- "\t [-S user:cksumalg] "
- "dataset [object...]\n"
- " %s -C [pool]\n"
- " %s -l dev\n"
- " %s -R pool:vdev:offset:size:flags\n"
- " %s [-p path_to_vdev_dir]\n"
- " %s -e pool | GUID | devid ...\n",
- cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
-
- (void) fprintf(stderr, " -u uberblock\n");
- (void) fprintf(stderr, " -d datasets\n");
- (void) fprintf(stderr, " -C cached pool configuration\n");
- (void) fprintf(stderr, " -i intent logs\n");
- (void) fprintf(stderr, " -b block statistics\n");
- (void) fprintf(stderr, " -m metaslabs\n");
- (void) fprintf(stderr, " -c checksum all metadata (twice for "
+ "Usage: %s [-CumdibcsDvhL] poolname [object...]\n"
+ " %s [-div] dataset [object...]\n"
+ " %s -m [-L] poolname [vdev [metaslab...]]\n"
+ " %s -R poolname vdev:offset:size[:flags]\n"
+ " %s -S poolname\n"
+ " %s -l [-u] device\n"
+ " %s -C\n\n",
+ cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+ (void) fprintf(stderr, " Dataset name must include at least one "
+ "separator character '/' or '@'\n");
+ (void) fprintf(stderr, " If dataset name is specified, only that "
+ "dataset is dumped\n");
+ (void) fprintf(stderr, " If object numbers are specified, only "
+ "those objects are dumped\n\n");
+ (void) fprintf(stderr, " Options to control amount of output:\n");
+ (void) fprintf(stderr, " -u uberblock\n");
+ (void) fprintf(stderr, " -d dataset(s)\n");
+ (void) fprintf(stderr, " -i intent logs\n");
+ (void) fprintf(stderr, " -C config (or cachefile if alone)\n");
+ (void) fprintf(stderr, " -h pool history\n");
+ (void) fprintf(stderr, " -b block statistics\n");
+ (void) fprintf(stderr, " -m metaslabs\n");
+ (void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
- (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
- (void) fprintf(stderr, " -S <user|all>:<cksum_alg|all> -- "
- "dump blkptr signatures\n");
- (void) fprintf(stderr, " -v verbose (applies to all others)\n");
+ (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
+ (void) fprintf(stderr, " -D dedup statistics\n");
+ (void) fprintf(stderr, " -S simulate dedup to measure effect\n");
+ (void) fprintf(stderr, " -v verbose (applies to all others)\n");
(void) fprintf(stderr, " -l dump label contents\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
"load spacemaps)\n");
- (void) fprintf(stderr, " -U cachefile_path -- use alternate "
- "cachefile\n");
(void) fprintf(stderr, " -R read and display block from a "
- "device\n");
- (void) fprintf(stderr, " -e Pool is exported/destroyed/"
- "has altroot\n");
- (void) fprintf(stderr, " -p <Path to vdev dir> (use with -e)\n");
- (void) fprintf(stderr, " -t <txg> highest txg to use when "
+ "device\n\n");
+ (void) fprintf(stderr, " Below options are intended for use "
+ "with other options (except -l):\n");
+ (void) fprintf(stderr, " -A ignore assertions (-A), enable "
+ "panic recovery (-AA) or both (-AAA)\n");
+ (void) fprintf(stderr, " -F attempt automatic rewind within "
+ "safe range of transaction groups\n");
+ (void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
+ "cachefile\n");
+ (void) fprintf(stderr, " -X attempt extreme rewind (does not "
+ "work with dataset)\n");
+ (void) fprintf(stderr, " -e pool is exported/destroyed/"
+ "has altroot/not in a cachefile\n");
+ (void) fprintf(stderr, " -p <path> -- use one or more with "
+ "-e to specify path to vdev dir\n");
+ (void) fprintf(stderr, " -P print numbers parsable\n");
+ (void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
@@ -146,68 +178,6 @@ fatal(const char *fmt, ...)
exit(1);
}
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
- nvpair_t *elem = NULL;
-
- while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
- switch (nvpair_type(elem)) {
- case DATA_TYPE_STRING:
- {
- char *value;
-
- VERIFY(nvpair_value_string(elem, &value) == 0);
- (void) printf("%*s%s='%s'\n", indent, "",
- nvpair_name(elem), value);
- }
- break;
-
- case DATA_TYPE_UINT64:
- {
- uint64_t value;
-
- VERIFY(nvpair_value_uint64(elem, &value) == 0);
- (void) printf("%*s%s=%llu\n", indent, "",
- nvpair_name(elem), (u_longlong_t)value);
- }
- break;
-
- case DATA_TYPE_NVLIST:
- {
- nvlist_t *value;
-
- VERIFY(nvpair_value_nvlist(elem, &value) == 0);
- (void) printf("%*s%s\n", indent, "",
- nvpair_name(elem));
- dump_nvlist(value, indent + 4);
- }
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- {
- nvlist_t **value;
- uint_t c, count;
-
- VERIFY(nvpair_value_nvlist_array(elem, &value,
- &count) == 0);
-
- for (c = 0; c < count; c++) {
- (void) printf("%*s%s[%u]\n", indent, "",
- nvpair_name(elem), c);
- dump_nvlist(value[c], indent + 8);
- }
- }
- break;
-
- default:
-
- (void) printf("bad config type %d for %s\n",
- nvpair_type(elem), nvpair_name(elem));
- }
- }
-}
-
/* ARGSUSED */
static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
@@ -227,6 +197,15 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
nvlist_free(nv);
}
+static void
+zdb_nicenum(uint64_t num, char *buf)
+{
+ if (dump_opt['P'])
+ (void) sprintf(buf, "%llu", (longlong_t)num);
+ else
+ nicenum(num, buf);
+}
+
const char dump_zap_stars[] = "****************************************";
const int dump_zap_width = sizeof (dump_zap_stars) - 1;
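
zdb_nicenum() is the single switch point behind the new -P (parsable) flag: with -P every size prints as an exact integer for scripts, otherwise it falls through to libzfs's nicenum(), which renders a short unit-suffixed string (the reason nearby hunks also grow their buffers from 5 or 6 bytes to 32). An integer-only approximation of that humanization, for illustration only (the real nicenum also emits fractional digits):

    /* Sketch only; not the libzfs implementation. */
    static void
    humanize(uint64_t n, char *buf)
    {
            const char units[] = "\0KMGTPE";
            int i = 0;

            while (n >= 1024 && units[i + 1] != '\0') {
                    n >>= 10;       /* binary units, as ZFS reports sizes */
                    i++;
            }
            if (i == 0)
                    (void) sprintf(buf, "%llu", (u_longlong_t)n);
            else
                    (void) sprintf(buf, "%llu%c", (u_longlong_t)n, units[i]);
    }
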
@@ -325,6 +304,13 @@ dump_none(objset_t *os, uint64_t object, void *data, size_t size)
}
/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ (void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/*ARGSUSED*/
void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
@@ -388,6 +374,79 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
/*ARGSUSED*/
static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dump_zap_stats(os, object);
+ /* contents are printed elsewhere, properly decoded */
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = ", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+ (void) printf(" %llx : [%d:%d:%d]\n",
+ (u_longlong_t)attr.za_first_integer,
+ (int)ATTR_LENGTH(attr.za_first_integer),
+ (int)ATTR_BSWAP(attr.za_first_integer),
+ (int)ATTR_NUM(attr.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ uint16_t *layout_attrs;
+ int i;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = [", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+
+ VERIFY(attr.za_integer_length == 2);
+ layout_attrs = umem_zalloc(attr.za_num_integers *
+ attr.za_integer_length, UMEM_NOFAIL);
+
+ VERIFY(zap_lookup(os, object, attr.za_name,
+ attr.za_integer_length,
+ attr.za_num_integers, layout_attrs) == 0);
+
+ for (i = 0; i != attr.za_num_integers; i++)
+ (void) printf(" %d ", (int)layout_attrs[i]);
+ (void) printf("]\n");
+ umem_free(layout_attrs,
+ attr.za_num_integers * attr.za_integer_length);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
@@ -441,17 +500,17 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
*/
alloc = 0;
for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
- VERIFY(0 == dmu_read(os, smo->smo_object, offset,
+ VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
sizeof (entry), &entry, DMU_READ_PREFETCH));
if (SM_DEBUG_DECODE(entry)) {
- (void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+ (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (entry)),
ddata[SM_DEBUG_ACTION_DECODE(entry)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
} else {
- (void) printf("\t\t[%4llu] %c range:"
- " %08llx-%08llx size: %06llx\n",
+ (void) printf("\t [%6llu] %c range:"
+ " %010llx-%010llx size: %06llx\n",
(u_longlong_t)(offset / sizeof (entry)),
SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
@@ -476,14 +535,14 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
static void
dump_metaslab_stats(metaslab_t *msp)
{
- char maxbuf[5];
+ char maxbuf[32];
space_map_t *sm = &msp->ms_map;
avl_tree_t *t = sm->sm_pp_root;
int free_pct = sm->sm_space * 100 / sm->sm_size;
- nicenum(space_map_maxsize(sm), maxbuf);
+ zdb_nicenum(space_map_maxsize(sm), maxbuf);
- (void) printf("\t %20s %10lu %7s %6s %4s %4d%%\n",
+ (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
"segments", avl_numnodes(t), "maxsize", maxbuf,
"freepct", free_pct);
}
@@ -495,16 +554,16 @@ dump_metaslab(metaslab_t *msp)
spa_t *spa = vd->vdev_spa;
space_map_t *sm = &msp->ms_map;
space_map_obj_t *smo = &msp->ms_smo;
- char freebuf[5];
+ char freebuf[32];
- nicenum(sm->sm_size - smo->smo_alloc, freebuf);
+ zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf);
(void) printf(
- "\tvdev %5llu offset %12llx spacemap %6llu free %5s\n",
+ "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
(u_longlong_t)(sm->sm_start / sm->sm_size),
(u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
- if (dump_opt['m'] > 1) {
+ if (dump_opt['m'] > 1 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
space_map_load_wait(sm);
if (!sm->sm_loaded)
@@ -525,22 +584,52 @@ dump_metaslab(metaslab_t *msp)
}
static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+ (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
+ (u_longlong_t)vd->vdev_id,
+ "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+ "offset", "spacemap", "free");
+ (void) printf("\t%15s %19s %15s %10s\n",
+ "---------------", "-------------------",
+ "---------------", "-------------");
+}
+
+static void
dump_metaslabs(spa_t *spa)
{
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- int c, m;
+ vdev_t *vd, *rvd = spa->spa_root_vdev;
+ uint64_t m, c = 0, children = rvd->vdev_children;
(void) printf("\nMetaslabs:\n");
- for (c = 0; c < rvd->vdev_children; c++) {
- vd = rvd->vdev_child[c];
+ if (!dump_opt['d'] && zopt_objects > 0) {
+ c = zopt_object[0];
+
+ if (c >= children)
+ (void) fatal("bad vdev id: %llu", (u_longlong_t)c);
- (void) printf("\t%-10s %-19s %-15s %-10s\n",
- "vdev", "offset", "spacemap", "free");
- (void) printf("\t%10s %19s %15s %10s\n",
- "----------", "-------------------",
- "---------------", "-------------");
+ if (zopt_objects > 1) {
+ vd = rvd->vdev_child[c];
+ print_vdev_metaslab_header(vd);
+
+ for (m = 1; m < zopt_objects; m++) {
+ if (zopt_object[m] < vd->vdev_ms_count)
+ dump_metaslab(
+ vd->vdev_ms[zopt_object[m]]);
+ else
+ (void) fprintf(stderr, "bad metaslab "
+ "number %llu\n",
+ (u_longlong_t)zopt_object[m]);
+ }
+ (void) printf("\n");
+ return;
+ }
+ children = c + 1;
+ }
+ for (; c < children; c++) {
+ vd = rvd->vdev_child[c];
+ print_vdev_metaslab_header(vd);
for (m = 0; m < vd->vdev_ms_count; m++)
dump_metaslab(vd->vdev_ms[m]);
@@ -549,6 +638,133 @@ dump_metaslabs(spa_t *spa)
}
static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+ const ddt_phys_t *ddp = dde->dde_phys;
+ const ddt_key_t *ddk = &dde->dde_key;
+ char *types[4] = { "ditto", "single", "double", "triple" };
+ char blkbuf[BP_SPRINTF_LEN];
+ blkptr_t blk;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ sprintf_blkptr(blkbuf, &blk);
+ (void) printf("index %llx refcnt %llu %s %s\n",
+ (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+ types[p], blkbuf);
+ }
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+ double rL, rP, rD, D, dedup, compress, copies;
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ rL = (double)dds->dds_ref_lsize;
+ rP = (double)dds->dds_ref_psize;
+ rD = (double)dds->dds_ref_dsize;
+ D = (double)dds->dds_dsize;
+
+ dedup = rD / D;
+ compress = rL / rP;
+ copies = rD / rP;
+
+ (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ "dedup * compress / copies = %.2f\n\n",
+ dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ char name[DDT_NAMELEN];
+ ddt_entry_t dde;
+ uint64_t walk = 0;
+ dmu_object_info_t doi;
+ uint64_t count, dspace, mspace;
+ int error;
+
+ error = ddt_object_info(ddt, type, class, &doi);
+
+ if (error == ENOENT)
+ return;
+ ASSERT(error == 0);
+
+ if ((count = ddt_object_count(ddt, type, class)) == 0)
+ return;
+
+ dspace = doi.doi_physical_blocks_512 << 9;
+ mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ddt_object_name(ddt, type, class, name);
+
+ (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+ name,
+ (u_longlong_t)count,
+ (u_longlong_t)(dspace / count),
+ (u_longlong_t)(mspace / count));
+
+ if (dump_opt['D'] < 3)
+ return;
+
+ zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+ if (dump_opt['D'] < 4)
+ return;
+
+ if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+ return;
+
+ (void) printf("%s contents:\n\n", name);
+
+ while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+ dump_dde(ddt, &dde, walk);
+
+ ASSERT(error == ENOENT);
+
+ (void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+ ddt_histogram_t ddh_total = { 0 };
+ ddt_stat_t dds_total = { 0 };
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ dump_ddt(ddt, type, class);
+ }
+ }
+ }
+
+ ddt_get_dedup_stats(spa, &dds_total);
+
+ if (dds_total.dds_blocks == 0) {
+ (void) printf("All DDTs are empty\n");
+ return;
+ }
+
+ (void) printf("\n");
+
+ if (dump_opt['D'] > 1) {
+ (void) printf("DDT histogram (aggregated over all DDTs):\n");
+ ddt_get_dedup_histogram(spa, &ddh_total);
+ zpool_dump_ddt(&dds_total, &ddh_total);
+ }
+
+ dump_dedup_ratio(&dds_total);
+}
+
+static void
dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
{
char *prefix = (void *)sm;
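
To make the ratios in dump_dedup_ratio() concrete with hypothetical numbers: if the referenced logical size rL (dds_ref_lsize) is 1000, the referenced physical size rP (dds_ref_psize) is 500, the referenced deduplicated size rD (dds_ref_dsize) is 600, and the allocated size D (dds_dsize) is 200, then dedup = 600/200 = 3.00, compress = 1000/500 = 2.00, copies = 600/500 = 1.20, and the combined multiplier dedup * compress / copies = 3.00 * 2.00 / 1.20 = 5.00, i.e. the pool stores five bytes of logical data per byte actually allocated. (Values chosen purely for arithmetic clarity.)
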
@@ -568,7 +784,7 @@ dump_dtl(vdev_t *vd, int indent)
char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
char prefix[256];
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
required = vdev_dtl_required(vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
@@ -598,6 +814,68 @@ dump_dtl(vdev_t *vd, int indent)
dump_dtl(vd->vdev_child[c], indent + 4);
}
+static void
+dump_history(spa_t *spa)
+{
+ nvlist_t **events = NULL;
+ char buf[SPA_MAXBLOCKSIZE];
+ uint64_t resid, len, off = 0;
+ uint_t num = 0;
+ int error;
+ time_t tsec;
+ struct tm t;
+ char tbuf[30];
+ char internalstr[MAXPATHLEN];
+
+ do {
+ len = sizeof (buf);
+
+ if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+ (void) fprintf(stderr, "Unable to read history: "
+ "error %d\n", error);
+ return;
+ }
+
+ if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+ break;
+
+ off -= resid;
+ } while (len != 0);
+
+ (void) printf("\nHistory:\n");
+ for (int i = 0; i < num; i++) {
+ uint64_t time, txg, ievent;
+ char *cmd, *intstr;
+
+ if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+ &time) != 0)
+ continue;
+ if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+ &cmd) != 0) {
+ if (nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+ continue;
+ verify(nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_TXG, &txg) == 0);
+ verify(nvlist_lookup_string(events[i],
+ ZPOOL_HIST_INT_STR, &intstr) == 0);
+ if (ievent >= LOG_END)
+ continue;
+
+ (void) snprintf(internalstr,
+ sizeof (internalstr),
+ "[internal %s txg:%lld] %s",
+ zfs_history_event_names[ievent], txg,
+ intstr);
+ cmd = internalstr;
+ }
+ tsec = time;
+ (void) localtime_r(&tsec, &t);
+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ (void) printf("%s %s\n", tbuf, cmd);
+ }
+}
+
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@ -605,35 +883,48 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
}
static uint64_t
-blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid)
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
{
- if (level < 0)
- return (blkid);
+ if (dnp == NULL) {
+ ASSERT(zb->zb_level < 0);
+ if (zb->zb_object == 0)
+ return (zb->zb_blkid);
+ return (zb->zb_blkid * BP_GET_LSIZE(bp));
+ }
- return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+ ASSERT(zb->zb_level >= 0);
+
+ return ((zb->zb_blkid <<
+ (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}
static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
{
- dva_t *dva = bp->blk_dva;
- int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
- int i;
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+
+ if (dump_opt['b'] >= 5) {
+ sprintf_blkptr(blkbuf, bp);
+ return;
+ }
blkbuf[0] = '\0';
- for (i = 0; i < ndvas; i++)
+ for (int i = 0; i < ndvas; i++)
(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
- (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+ (void) sprintf(blkbuf + strlen(blkbuf),
+ "%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_birth);
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
static void
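
The reworked blkid2offset() turns a bookmark into a byte offset inside the object: each indirect level widens the span by a factor of 2^(dn_indblkshift - SPA_BLKPTRSHIFT) block pointers, and the result is scaled by the data block size (dn_datablkszsec counts 512-byte sectors). With typical values, assumed here only for illustration: dn_indblkshift = 14 (16 KB indirect blocks), SPA_BLKPTRSHIFT = 7 (128-byte block pointers), dn_datablkszsec = 256 (128 KB data blocks), a level-1 bookmark with blkid = 2 resolves to (2 << 7) * 256 << 9 = 33554432 bytes, i.e. 32 MB, matching the intuition that each level-1 pointer covers 128 data blocks of 128 KB.
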
@@ -646,8 +937,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- (void) printf("%16llx ",
- (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+ (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
ASSERT(zb->zb_level >= 0);
@@ -659,23 +949,15 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
}
}
- sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+ sprintf_blkptr_compact(blkbuf, bp);
(void) printf("%s\n", blkbuf);
}
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
blkptr_t *bp, const zbookmark_t *zb)
{
- int err;
+ int err = 0;
if (bp->blk_birth == 0)
return (0);
@@ -694,6 +976,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
+ ASSERT(buf->b_data);
/* recursively visit blocks below this */
cbp = buf->b_data;
@@ -726,11 +1009,11 @@ dump_indirect(dnode_t *dn)
(void) printf("Indirect blocks:\n");
- SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os),
+ SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
dn->dn_object, dnp->dn_nlevels - 1, 0);
for (j = 0; j < dnp->dn_nblkptr; j++) {
czb.zb_blkid = j;
- (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp,
+ (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
&dnp->dn_blkptr[j], &czb);
}
@@ -743,7 +1026,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dir_phys_t *dd = data;
time_t crtime;
- char nice[6];
+ char nice[32];
if (dd == NULL)
return;
@@ -760,15 +1043,15 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
(u_longlong_t)dd->dd_origin_obj);
(void) printf("\t\tchild_dir_zapobj = %llu\n",
(u_longlong_t)dd->dd_child_dir_zapobj);
- nicenum(dd->dd_used_bytes, nice);
+ zdb_nicenum(dd->dd_used_bytes, nice);
(void) printf("\t\tused_bytes = %s\n", nice);
- nicenum(dd->dd_compressed_bytes, nice);
+ zdb_nicenum(dd->dd_compressed_bytes, nice);
(void) printf("\t\tcompressed_bytes = %s\n", nice);
- nicenum(dd->dd_uncompressed_bytes, nice);
+ zdb_nicenum(dd->dd_uncompressed_bytes, nice);
(void) printf("\t\tuncompressed_bytes = %s\n", nice);
- nicenum(dd->dd_quota, nice);
+ zdb_nicenum(dd->dd_quota, nice);
(void) printf("\t\tquota = %s\n", nice);
- nicenum(dd->dd_reserved, nice);
+ zdb_nicenum(dd->dd_reserved, nice);
(void) printf("\t\treserved = %s\n", nice);
(void) printf("\t\tprops_zapobj = %llu\n",
(u_longlong_t)dd->dd_props_zapobj);
@@ -778,7 +1061,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
(u_longlong_t)dd->dd_flags);
#define DO(which) \
- nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
+ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
DO(HEAD);
DO(SNAP);
@@ -794,7 +1077,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dataset_phys_t *ds = data;
time_t crtime;
- char used[6], compressed[6], uncompressed[6], unique[6];
+ char used[32], compressed[32], uncompressed[32], unique[32];
char blkbuf[BP_SPRINTF_LEN];
if (ds == NULL)
@@ -802,11 +1085,11 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
ASSERT(size == sizeof (*ds));
crtime = ds->ds_creation_time;
- nicenum(ds->ds_used_bytes, used);
- nicenum(ds->ds_compressed_bytes, compressed);
- nicenum(ds->ds_uncompressed_bytes, uncompressed);
- nicenum(ds->ds_unique_bytes, unique);
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
+ zdb_nicenum(ds->ds_used_bytes, used);
+ zdb_nicenum(ds->ds_compressed_bytes, compressed);
+ zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+ zdb_nicenum(ds->ds_unique_bytes, unique);
+ sprintf_blkptr(blkbuf, &ds->ds_bp);
(void) printf("\t\tdir_obj = %llu\n",
(u_longlong_t)ds->ds_dir_obj);
@@ -820,6 +1103,8 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
(u_longlong_t)ds->ds_snapnames_zapobj);
(void) printf("\t\tnum_children = %llu\n",
(u_longlong_t)ds->ds_num_children);
+ (void) printf("\t\tuserrefs_obj = %llu\n",
+ (u_longlong_t)ds->ds_userrefs_obj);
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
(void) printf("\t\tcreation_txg = %llu\n",
(u_longlong_t)ds->ds_creation_txg);
@@ -842,63 +1127,88 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
(void) printf("\t\tbp = %s\n", blkbuf);
}
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ ASSERT(bp->blk_birth != 0);
+ sprintf_blkptr_compact(blkbuf, bp);
+ (void) printf("\t%s\n", blkbuf);
+ return (0);
+}
+
static void
-dump_bplist(objset_t *mos, uint64_t object, char *name)
+dump_bpobj(bpobj_t *bpo, char *name)
{
- bplist_t bpl = { 0 };
- blkptr_t blk, *bp = &blk;
- uint64_t itor = 0;
- char bytes[6];
- char comp[6];
- char uncomp[6];
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
if (dump_opt['d'] < 3)
return;
- mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- VERIFY(0 == bplist_open(&bpl, mos, object));
- if (bplist_empty(&bpl)) {
- bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
- return;
- }
-
- nicenum(bpl.bpl_phys->bpl_bytes, bytes);
- if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
- nicenum(bpl.bpl_phys->bpl_comp, comp);
- nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
- (void) printf("\n %s: %llu entries, %s (%s/%s comp)\n",
- name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
+ zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+ if (bpo->bpo_havesubobj) {
+ zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+ zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+ (void) printf("\n %s: %llu local blkptrs, %llu subobjs, "
+ "%s (%s/%s comp)\n",
+ name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
bytes, comp, uncomp);
} else {
- (void) printf("\n %s: %llu entries, %s\n",
- name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes);
+ (void) printf("\n %s: %llu blkptrs, %s\n",
+ name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
}
- if (dump_opt['d'] < 5) {
- bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
+ if (dump_opt['d'] < 5)
return;
- }
(void) printf("\n");
- while (bplist_iterate(&bpl, &itor, bp) == 0) {
- char blkbuf[BP_SPRINTF_LEN];
+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+}
- ASSERT(bp->blk_birth != 0);
- sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
- (void) printf("\tItem %3llu: %s\n",
- (u_longlong_t)itor - 1, blkbuf);
- }
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+ dsl_deadlist_entry_t *dle;
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ zdb_nicenum(dl->dl_phys->dl_used, bytes);
+ zdb_nicenum(dl->dl_phys->dl_comp, comp);
+ zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+ (void) printf("\n Deadlist: %s (%s/%s comp)\n",
+ bytes, comp, uncomp);
- bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
+ if (dump_opt['d'] < 4)
+ return;
+
+ (void) printf("\n");
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ (void) printf(" mintxg %llu -> obj %llu\n",
+ (longlong_t)dle->dle_mintxg,
+ (longlong_t)dle->dle_bpobj.bpo_object);
+
+ if (dump_opt['d'] >= 5)
+ dump_bpobj(&dle->dle_bpobj, "");
+ }
}
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
+static boolean_t sa_loaded;
+sa_attr_type_t *sa_attr_table;
static void
fuid_table_destroy()
@@ -931,12 +1241,12 @@ print_idstr(uint64_t id, const char *id_type)
}
static void
-dump_uidgid(objset_t *os, znode_phys_t *zp)
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
uint32_t uid_idx, gid_idx;
- uid_idx = FUID_INDEX(zp->zp_uid);
- gid_idx = FUID_INDEX(zp->zp_gid);
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
/* Load domain table, if not already loaded */
if (!fuid_table_loaded && (uid_idx || gid_idx)) {
@@ -951,50 +1261,111 @@ dump_uidgid(objset_t *os, znode_phys_t *zp)
fuid_table_loaded = B_TRUE;
}
- print_idstr(zp->zp_uid, "uid");
- print_idstr(zp->zp_gid, "gid");
+ print_idstr(uid, "uid");
+ print_idstr(gid, "gid");
}
/*ARGSUSED*/
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
- znode_phys_t *zp = data;
- time_t z_crtime, z_atime, z_mtime, z_ctime;
char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
+ sa_handle_t *hdl;
+ uint64_t xattr, rdev, gen;
+ uint64_t uid, gid, mode, fsize, parent, links;
+ uint64_t pflags;
+ uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+ time_t z_crtime, z_atime, z_mtime, z_ctime;
+ sa_bulk_attr_t bulk[12];
+ int idx = 0;
int error;
- ASSERT(size >= sizeof (znode_phys_t));
+ if (!sa_loaded) {
+ uint64_t sa_attrs = 0;
+ uint64_t version;
+
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &version) == 0);
+ if (version >= ZPL_VERSION_SA) {
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+ 8, 1, &sa_attrs) == 0);
+ }
+ if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
+ ZPL_END, &sa_attr_table)) != 0) {
+ (void) printf("sa_setup failed errno %d, can't "
+ "display znode contents\n", error);
+ return;
+ }
+ sa_loaded = B_TRUE;
+ }
+
+ if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+ (void) printf("Failed to get handle for SA znode\n");
+ return;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+ &links, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+ &fsize, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+ acctm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+ modtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+ crtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+ chgtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+ &pflags, 8);
+
+ if (sa_bulk_lookup(hdl, bulk, idx)) {
+ (void) sa_handle_destroy(hdl);
+ return;
+ }
error = zfs_obj_to_path(os, object, path, sizeof (path));
if (error != 0) {
(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
(u_longlong_t)object);
}
-
if (dump_opt['d'] < 3) {
(void) printf("\t%s\n", path);
+ (void) sa_handle_destroy(hdl);
return;
}
- z_crtime = (time_t)zp->zp_crtime[0];
- z_atime = (time_t)zp->zp_atime[0];
- z_mtime = (time_t)zp->zp_mtime[0];
- z_ctime = (time_t)zp->zp_ctime[0];
+ z_crtime = (time_t)crtm[0];
+ z_atime = (time_t)acctm[0];
+ z_mtime = (time_t)modtm[0];
+ z_ctime = (time_t)chgtm[0];
(void) printf("\tpath %s\n", path);
- dump_uidgid(os, zp);
+ dump_uidgid(os, uid, gid);
(void) printf("\tatime %s", ctime(&z_atime));
(void) printf("\tmtime %s", ctime(&z_mtime));
(void) printf("\tctime %s", ctime(&z_ctime));
(void) printf("\tcrtime %s", ctime(&z_crtime));
- (void) printf("\tgen %llu\n", (u_longlong_t)zp->zp_gen);
- (void) printf("\tmode %llo\n", (u_longlong_t)zp->zp_mode);
- (void) printf("\tsize %llu\n", (u_longlong_t)zp->zp_size);
- (void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent);
- (void) printf("\tlinks %llu\n", (u_longlong_t)zp->zp_links);
- (void) printf("\txattr %llu\n", (u_longlong_t)zp->zp_xattr);
- (void) printf("\trdev 0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+ (void) printf("\tgen %llu\n", (u_longlong_t)gen);
+ (void) printf("\tmode %llo\n", (u_longlong_t)mode);
+ (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
+ (void) printf("\tparent %llu\n", (u_longlong_t)parent);
+ (void) printf("\tlinks %llu\n", (u_longlong_t)links);
+ (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
+ sa_handle_destroy(hdl);
}
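
The rewritten dump_znode() never touches a znode_phys_t: with system attributes the bonus (and optional spill block) layout is self-describing, so every field is fetched through the SA API. The SA_ADD_BULK_ATTR()/sa_bulk_lookup() pairing is a gather pattern: describe each wanted attribute once in a table, then satisfy the whole table in a single pass over the layout. A libc-only sketch of the pattern, with invented stand-in types:

    #include <stdio.h>
    #include <string.h>

    /* Invented stand-ins for sa_bulk_attr_t and the SA layout registry. */
    typedef struct bulk_attr {
        int    ba_attr;    /* attribute id to fetch */
        void   *ba_buf;    /* destination buffer */
        size_t ba_len;     /* size in bytes */
    } bulk_attr_t;

    static void
    bulk_lookup_sketch(const unsigned char *blob, const size_t *offset,
        bulk_attr_t *tab, int count)
    {
        /* One pass over the layout satisfies the whole table. */
        for (int i = 0; i < count; i++)
            (void) memcpy(tab[i].ba_buf,
                blob + offset[tab[i].ba_attr], tab[i].ba_len);
    }

    int
    main(void)
    {
        unsigned char blob[16];
        unsigned long long in0 = 7, in1 = 42, uid = 0, gid = 0;
        size_t offset[2] = { 0, 8 };    /* attr 0 at 0, attr 1 at 8 */
        bulk_attr_t tab[2] = {
            { 0, &uid, sizeof (uid) },
            { 1, &gid, sizeof (gid) },
        };

        (void) memcpy(blob + 0, &in0, 8);
        (void) memcpy(blob + 8, &in1, 8);
        bulk_lookup_sketch(blob, offset, tab, 2);
        (void) printf("uid %llu gid %llu\n", uid, gid);    /* 7 and 42 */
        return (0);
    }
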
/*ARGSUSED*/
@@ -1009,7 +1380,7 @@ dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
}
-static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
dump_none, /* unallocated */
dump_zap, /* object directory */
dump_uint64, /* object array */
@@ -1051,6 +1422,20 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
dump_zap, /* DSL scrub queue */
dump_zap, /* ZFS user/group used */
dump_zap, /* ZFS user/group quota */
+ dump_zap, /* snapshot refcount tags */
+ dump_ddt_zap, /* DDT ZAP object */
+ dump_zap, /* DDT statistics */
+ dump_znode, /* SA object */
+ dump_zap, /* SA Master Node */
+ dump_sa_attrs, /* SA attribute registration */
+ dump_sa_layouts, /* SA attribute layouts */
+ dump_zap, /* DSL scrub translations */
+ dump_none, /* fake dedup BP */
+ dump_zap, /* deadlist */
+ dump_none, /* deadlist hdr */
+ dump_zap, /* dsl clones */
+ dump_none, /* bpobj subobjs */
+ dump_unknown, /* Unknown type, must be last */
};
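
The viewer table gains one slot past DMU_OT_NUMTYPES so an unrecognized on-disk type still dispatches to dump_unknown instead of indexing out of bounds. That relies on clamping the index first; the ZDB_OT_TYPE() macro used by dump_object() below is defined earlier in zdb.c and presumably amounts to:

    /* Assumed shape of the clamping macro (not shown in this diff). */
    #define ZDB_OT_TYPE(t) \
        (((unsigned)(t) < DMU_OT_NUMTYPES) ? (int)(t) : DMU_OT_NUMTYPES)
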
static void
@@ -1061,18 +1446,20 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
dnode_t *dn;
void *bonus = NULL;
size_t bsize = 0;
- char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
+ char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+ char bonus_size[32];
char aux[50];
int error;
if (*print_header) {
- (void) printf("\n Object lvl iblk dblk lsize"
- " asize type\n");
+ (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n",
+ "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
+ "%full", "type");
*print_header = 0;
}
if (object == 0) {
- dn = os->os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
} else {
error = dmu_bonus_hold(os, object, FTAG, &db);
if (error)
@@ -1080,50 +1467,55 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
object, error);
bonus = db->db_data;
bsize = db->db_size;
- dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dn = DB_DNODE((dmu_buf_impl_t *)db);
}
dmu_object_info_from_dnode(dn, &doi);
- nicenum(doi.doi_metadata_block_size, iblk);
- nicenum(doi.doi_data_block_size, dblk);
- nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
- lsize);
- nicenum(doi.doi_physical_blks << 9, asize);
- nicenum(doi.doi_bonus_size, bonus_size);
+ zdb_nicenum(doi.doi_metadata_block_size, iblk);
+ zdb_nicenum(doi.doi_data_block_size, dblk);
+ zdb_nicenum(doi.doi_max_offset, lsize);
+ zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
+ zdb_nicenum(doi.doi_bonus_size, bonus_size);
+ (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+ doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+ doi.doi_max_offset);
aux[0] = '\0';
if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
- zio_checksum_table[doi.doi_checksum].ci_name);
+ ZDB_CHECKSUM_NAME(doi.doi_checksum));
}
if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
- zio_compress_table[doi.doi_compress].ci_name);
+ ZDB_COMPRESS_NAME(doi.doi_compress));
}
- (void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n",
- (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
- asize, dmu_ot[doi.doi_type].ot_name, aux);
+ (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n",
+ (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+ asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
- (void) printf("%10s %3s %5s %5s %5s %5s %s\n",
- "", "", "", "", bonus_size, "bonus",
- dmu_ot[doi.doi_bonus_type].ot_name);
+ (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n",
+ "", "", "", "", "", bonus_size, "bonus",
+ ZDB_OT_NAME(doi.doi_bonus_type));
}
if (verbosity >= 4) {
- (void) printf("\tdnode flags: %s%s\n",
+ (void) printf("\tdnode flags: %s%s%s\n",
(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
"USED_BYTES " : "",
(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
- "USERUSED_ACCOUNTED " : "");
+ "USERUSED_ACCOUNTED " : "",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+ "SPILL_BLKPTR" : "");
(void) printf("\tdnode maxblkid: %llu\n",
(longlong_t)dn->dn_phys->dn_maxblkid);
- object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
- object_viewer[doi.doi_type](os, object, NULL, 0);
+ object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
+ bonus, bsize);
+ object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
*print_header = 1;
}
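
The new %full column compresses a lot into one expression: doi_fill_count counts populated blocks, so fill * data_block_size / max_offset is the populated fraction, and for the meta-dnode (object 0) each fill unit is a dnode rather than a block, hence the extra DNODES_PER_BLOCK divisor. A worked example with invented values:

    #include <stdio.h>

    int
    main(void)
    {
        double fill_count = 7;              /* populated blocks */
        double blksz = 131072;              /* 128K data blocks */
        double max_offset = 10 * 131072;    /* object is 10 blocks long */

        /* prints " 70.00": 7 of 10 blocks populated */
        (void) printf("%6.2f\n", 100.0 * fill_count * blksz / max_offset);
        return (0);
    }
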
@@ -1145,6 +1537,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
for (;;) {
+ char segsize[32];
error = dnode_next_offset(dn,
0, &start, minlvl, blkfill, 0);
if (error)
@@ -1152,7 +1545,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
end = start;
error = dnode_next_offset(dn,
DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
- nicenum(end - start, segsize);
+ zdb_nicenum(end - start, segsize);
(void) printf("\t\tsegment [%016llx, %016llx)"
" size %5s\n", (u_longlong_t)start,
(u_longlong_t)end, segsize);
@@ -1175,7 +1568,7 @@ dump_dir(objset_t *os)
dmu_objset_stats_t dds;
uint64_t object, object_count;
uint64_t refdbytes, usedobjs, scratch;
- char numbuf[8];
+ char numbuf[32];
char blkbuf[BP_SPRINTF_LEN + 20];
char osname[MAXNAMELEN];
char *type = "UNKNOWN";
@@ -1190,21 +1583,20 @@ dump_dir(objset_t *os)
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
- usedobjs = os->os->os_rootbp->blk_fill;
- refdbytes = os->os->os_spa->spa_dsl_pool->
+ usedobjs = os->os_rootbp->blk_fill;
+ refdbytes = os->os_spa->spa_dsl_pool->
dp_mos_dir->dd_phys->dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
- ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill);
+ ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
- nicenum(refdbytes, numbuf);
+ zdb_nicenum(refdbytes, numbuf);
if (verbosity >= 4) {
- (void) sprintf(blkbuf + strlen(blkbuf), ", rootbp ");
- (void) sprintf_blkptr(blkbuf + strlen(blkbuf),
- BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
+ (void) sprintf(blkbuf, ", rootbp ");
+ (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp);
} else {
blkbuf[0] = '\0';
}
@@ -1217,18 +1609,6 @@ dump_dir(objset_t *os)
(u_longlong_t)dds.dds_creation_txg,
numbuf, (u_longlong_t)usedobjs, blkbuf);
- dump_intent_log(dmu_objset_zil(os));
-
- if (dmu_objset_ds(os) != NULL)
- dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
- dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
-
- if (verbosity < 2)
- return;
-
- if (os->os->os_rootbp->blk_birth == 0)
- return;
-
if (zopt_objects != 0) {
for (i = 0; i < zopt_objects; i++)
dump_object(os, zopt_object[i], verbosity,
@@ -1237,10 +1617,22 @@ dump_dir(objset_t *os)
return;
}
+ if (dump_opt['i'] != 0 || verbosity >= 2)
+ dump_intent_log(dmu_objset_zil(os));
+
+ if (dmu_objset_ds(os) != NULL)
+ dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
+
+ if (verbosity < 2)
+ return;
+
+ if (os->os_rootbp->blk_birth == 0)
+ return;
+
dump_object(os, 0, verbosity, &print_header);
object_count = 0;
- if (os->os->os_userused_dnode &&
- os->os->os_userused_dnode->dn_type != 0) {
+ if (DMU_USERUSED_DNODE(os) != NULL &&
+ DMU_USERUSED_DNODE(os)->dn_type != 0) {
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
}
@@ -1262,11 +1654,11 @@ dump_dir(objset_t *os)
}
static void
-dump_uberblock(uberblock_t *ub)
+dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
{
time_t timestamp = ub->ub_timestamp;
- (void) printf("Uberblock\n\n");
+ (void) printf(header ? header : "");
(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
@@ -1275,25 +1667,34 @@ dump_uberblock(uberblock_t *ub)
(u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
if (dump_opt['u'] >= 3) {
char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
+ sprintf_blkptr(blkbuf, &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
- (void) printf("\n");
+ (void) printf(footer ? footer : "");
}
static void
-dump_config(const char *pool)
+dump_config(spa_t *spa)
{
- spa_t *spa = NULL;
+ dmu_buf_t *db;
+ size_t nvsize = 0;
+ int error = 0;
+

+ error = dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db);
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (pool == NULL)
- (void) printf("%s\n", spa_name(spa));
- if (pool == NULL || strcmp(pool, spa_name(spa)) == 0)
- dump_nvlist(spa->spa_config, 4);
+ if (error == 0) {
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ (void) printf("\nMOS Configuration:\n");
+ dump_packed_nvlist(spa->spa_meta_objset,
+ spa->spa_config_object, (void *)&nvsize, 1);
+ } else {
+ (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
+ (u_longlong_t)spa->spa_config_object, error);
}
- mutex_exit(&spa_namespace_lock);
}
static void
@@ -1342,41 +1743,75 @@ dump_cachefile(const char *cachefile)
nvlist_free(config);
}
+#define ZDB_MAX_UB_HEADER_SIZE 32
+
+static void
+dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+{
+ vdev_t vd;
+ vdev_t *vdp = &vd;
+ char header[ZDB_MAX_UB_HEADER_SIZE];
+
+ vd.vdev_ashift = ashift;
+ vdp->vdev_top = vdp;
+
+ for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+ uberblock_t *ub = (void *)((char *)lbl + uoff);
+
+ if (uberblock_verify(ub))
+ continue;
+ (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+ "Uberblock[%d]\n", i);
+ dump_uberblock(ub, header, "");
+ }
+}
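
dump_label_uberblocks() fabricates just enough of a vdev_t for the VDEV_UBERBLOCK_* macros because slot geometry depends on ashift: each label carries a fixed-size uberblock ring split into 1 << max(ashift, 10) byte slots (constants assumed from sys/vdev_impl.h, where the ring is 128K). So a 4K-sector vdev exposes 32 slots where a 512-byte-sector vdev exposes 128. A libc-only sketch of the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    #define UB_RING       (128 << 10)    /* assumed per-label ring size */
    #define UB_SHIFT_MIN  10             /* slots are at least 1K */

    int
    main(void)
    {
        uint64_t ashift = 12;            /* a 4K-sector vdev */
        uint64_t shift = (ashift > UB_SHIFT_MIN) ? ashift : UB_SHIFT_MIN;
        uint64_t count = UB_RING >> shift;    /* 32 slots here */

        for (uint64_t i = 0; i < count; i++)
            (void) printf("uberblock[%2llu] at ring offset 0x%llx\n",
                (unsigned long long)i,
                (unsigned long long)(i << shift));
        return (0);
    }
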
+
static void
dump_label(const char *dev)
{
int fd;
vdev_label_t label;
- char *buf = label.vl_vdev_phys.vp_nvlist;
+ char *path, *buf = label.vl_vdev_phys.vp_nvlist;
size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
struct stat64 statbuf;
- uint64_t psize;
- int l;
+ uint64_t psize, ashift;
+ int len = strlen(dev) + 1;
+
+ if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+ len++;
+ path = malloc(len);
+ (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+ } else {
+ path = strdup(dev);
+ }
- if ((fd = open64(dev, O_RDONLY)) < 0) {
- (void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+ if ((fd = open64(path, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+ free(path);
exit(1);
}
if (fstat64(fd, &statbuf) != 0) {
- (void) printf("failed to stat '%s': %s\n", dev,
+ (void) printf("failed to stat '%s': %s\n", path,
strerror(errno));
+ free(path);
+ (void) close(fd);
exit(1);
}
- if (S_ISCHR(statbuf.st_mode)) {
- if (ioctl(fd, DIOCGMEDIASIZE, &statbuf.st_size) == -1) {
- (void) printf("failed to get size of '%s': %s\n", dev,
- strerror(errno));
- exit(1);
- }
+ if (S_ISBLK(statbuf.st_mode)) {
+ (void) printf("cannot use '%s': character device required\n",
+ path);
+ free(path);
+ (void) close(fd);
+ exit(1);
}
psize = statbuf.st_size;
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
- for (l = 0; l < VDEV_LABELS; l++) {
-
+ for (int l = 0; l < VDEV_LABELS; l++) {
nvlist_t *config = NULL;
(void) printf("--------------------------------------------\n");
@@ -1391,105 +1826,47 @@ dump_label(const char *dev)
if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
(void) printf("failed to unpack label %d\n", l);
- continue;
+ ashift = SPA_MINBLOCKSHIFT;
+ } else {
+ nvlist_t *vdev_tree = NULL;
+
+ dump_nvlist(config, 4);
+ if ((nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+ (nvlist_lookup_uint64(vdev_tree,
+ ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+ ashift = SPA_MINBLOCKSHIFT;
+ nvlist_free(config);
}
- dump_nvlist(config, 4);
- nvlist_free(config);
+ if (dump_opt['u'])
+ dump_label_uberblocks(&label, ashift);
}
+
+ free(path);
+ (void) close(fd);
}
/*ARGSUSED*/
static int
-dump_one_dir(char *dsname, void *arg)
+dump_one_dir(const char *dsname, void *arg)
{
int error;
objset_t *os;
- error = dmu_objset_open(dsname, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os);
+ error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
if (error) {
- (void) printf("Could not open %s\n", dsname);
+ (void) printf("Could not open %s, error %d\n", dsname, error);
return (0);
}
dump_dir(os);
- dmu_objset_close(os);
+ dmu_objset_disown(os, FTAG);
fuid_table_destroy();
+ sa_loaded = B_FALSE;
return (0);
}
-static void
-zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
-{
- vdev_t *vd = sm->sm_ppd;
-
- (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
- (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_load(space_map_t *sm)
-{
-}
-
-static void
-zdb_space_map_unload(space_map_t *sm)
-{
- space_map_vacate(sm, zdb_leak, sm);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-}
-
-static space_map_ops_t zdb_space_map_ops = {
- zdb_space_map_load,
- zdb_space_map_unload,
- NULL, /* alloc */
- zdb_space_map_claim,
- NULL, /* free */
- NULL /* maxsize */
-};
-
-static void
-zdb_leak_init(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- mutex_enter(&msp->ms_lock);
- VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops,
- SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0);
- msp->ms_map.sm_ppd = vd;
- mutex_exit(&msp->ms_lock);
- }
- }
-}
-
-static void
-zdb_leak_fini(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- mutex_enter(&msp->ms_lock);
- space_map_unload(&msp->ms_map);
- mutex_exit(&msp->ms_lock);
- }
- }
-}
-
/*
- * Verify that the sum of the sizes of all blocks in the pool adds up
- * to the SPA's sa_alloc total.
+ * Block statistics.
*/
typedef struct zdb_blkstats {
uint64_t zb_asize;
@@ -1498,24 +1875,45 @@ typedef struct zdb_blkstats {
uint64_t zb_count;
} zdb_blkstats_t;
-#define DMU_OT_DEFERRED DMU_OT_NONE
-#define DMU_OT_TOTAL DMU_OT_NUMTYPES
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
+#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
+#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2)
+
+static char *zdb_ot_extname[] = {
+ "deferred free",
+ "dedup ditto",
+ "Total",
+};
#define ZB_TOTAL DN_MAX_LEVELS
typedef struct zdb_cb {
- zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
+ zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+ uint64_t zcb_dedup_asize;
+ uint64_t zcb_dedup_blocks;
uint64_t zcb_errors[256];
int zcb_readfails;
int zcb_haderrors;
+ spa_t *zcb_spa;
} zdb_cb_t;
static void
-zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+ dmu_object_type_t type)
{
+ uint64_t refcnt = 0;
+
+ ASSERT(type < ZDB_OT_TOTAL);
+
+ if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ return;
+
for (int i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
- int t = (i & 1) ? type : DMU_OT_TOTAL;
+ int t = (i & 1) ? type : ZDB_OT_TOTAL;
zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
zb->zb_asize += BP_GET_ASIZE(bp);
@@ -1524,127 +1922,258 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
zb->zb_count++;
}
- if (dump_opt['S']) {
- boolean_t print_sig;
-
- print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
- BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS);
-
- if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg)
- print_sig = B_FALSE;
-
- if (print_sig) {
- (void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t"
- "%llx:%llx:%llx:%llx\n",
- (u_longlong_t)BP_GET_LEVEL(bp),
- (longlong_t)BP_GET_PSIZE(bp),
- (longlong_t)BP_GET_NDVAS(bp),
- dmu_ot[BP_GET_TYPE(bp)].ot_name,
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ if (dump_opt['L'])
+ return;
+
+ if (BP_GET_DEDUP(bp)) {
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ ddt = ddt_select(zcb->zcb_spa, bp);
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_FALSE);
+
+ if (dde == NULL) {
+ refcnt = 0;
+ } else {
+ ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ refcnt = ddp->ddp_refcnt;
+ if (ddt_phys_total_refcnt(dde) == 0)
+ ddt_remove(ddt, dde);
}
+ ddt_exit(ddt);
}
- if (!dump_opt['L'])
- VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
- NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
+ VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+ refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+ bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
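
The dedup handling here is the subtle part: traversal reaches one physical block once per referencing blkptr, so each visit decrements the DDT refcount and the entry is dropped once every expected reference has been seen. The claim txg follows from that: while references remain, zio_claim() is passed txg 0, which (per the metaslab code's convention, not shown in this diff) is effectively a verify-only dry run; the final reference performs the real claim, so each physical block is claimed exactly once. A toy rendition of the bookkeeping:

    #include <stdio.h>

    int
    main(void)
    {
        int refcnt = 3;    /* the DDT records three references */

        for (int visit = 1; visit <= 3; visit++) {
            refcnt--;
            if (refcnt != 0)
                (void) printf("visit %d: txg 0, verify only\n",
                    visit);
            else
                (void) printf("visit %d: real claim, drop DDT "
                    "entry\n", visit);
        }
        return (0);
    }
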
+/* ARGSUSED */
static int
-zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
zdb_cb_t *zcb = arg;
char blkbuf[BP_SPRINTF_LEN];
dmu_object_type_t type;
- boolean_t is_l0_metadata;
+ boolean_t is_metadata;
if (bp == NULL)
return (0);
type = BP_GET_TYPE(bp);
- zdb_count_block(spa, zcb, bp, type);
+ zdb_count_block(zcb, zilog, bp, type);
- /*
- * if we do metadata-only checksumming there's no need to checksum
- * indirect blocks here because it is done during traverse
- */
- is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES &&
- dmu_ot[type].ot_metadata);
+ is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+
+ if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+ int ioerr;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = malloc(size);
+ int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
- if (dump_opt['c'] > 1 || dump_opt['S'] ||
- (dump_opt['c'] && is_l0_metadata)) {
- int ioerr, size;
- void *data;
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ flags |= ZIO_FLAG_SPECULATIVE;
- size = BP_GET_LSIZE(bp);
- data = malloc(size);
ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb));
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+
free(data);
- /* We expect io errors on intent log */
- if (ioerr && type != DMU_OT_INTENT_LOG) {
+ if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) {
zcb->zcb_haderrors = 1;
zcb->zcb_errors[ioerr]++;
if (dump_opt['b'] >= 2)
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+ sprintf_blkptr(blkbuf, bp);
else
blkbuf[0] = '\0';
- if (!dump_opt['S']) {
- (void) printf("zdb_blkptr_cb: "
- "Got error %d reading "
- "<%llu, %llu, %lld, %llx> %s -- skipping\n",
- ioerr,
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level,
- (u_longlong_t)zb->zb_blkid,
- blkbuf);
- }
+ (void) printf("zdb_blkptr_cb: "
+ "Got error %d reading "
+ "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+ ioerr,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid,
+ blkbuf);
}
}
zcb->zcb_readfails = 0;
if (dump_opt['b'] >= 4) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
- (void) printf("objset %llu object %llu offset 0x%llx %s\n",
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("objset %llu object %llu "
+ "level %lld offset 0x%llx %s\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
- (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid),
+ (longlong_t)zb->zb_level,
+ (u_longlong_t)blkid2offset(dnp, bp, zb),
blkbuf);
}
return (0);
}
+static void
+zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = sm->sm_ppd;
+
+ (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+}
+
+/* ARGSUSED */
+static void
+zdb_space_map_load(space_map_t *sm)
+{
+}
+
+static void
+zdb_space_map_unload(space_map_t *sm)
+{
+ space_map_vacate(sm, zdb_leak, sm);
+}
+
+/* ARGSUSED */
+static void
+zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+}
+
+static space_map_ops_t zdb_space_map_ops = {
+ zdb_space_map_load,
+ zdb_space_map_unload,
+ NULL, /* alloc */
+ zdb_space_map_claim,
+ NULL, /* free */
+ NULL /* maxsize */
+};
+
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ ddt_bookmark_t ddb = { 0 };
+ ddt_entry_t dde;
+ int error;
+
+ while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+ blkptr_t blk;
+ ddt_phys_t *ddp = dde.dde_phys;
+
+ if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ return;
+
+ ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddb.ddb_checksum,
+ &dde.dde_key, ddp, &blk);
+ if (p == DDT_PHYS_DITTO) {
+ zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+ } else {
+ zcb->zcb_dedup_asize +=
+ BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ zcb->zcb_dedup_blocks++;
+ }
+ }
+ if (!dump_opt['L']) {
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
+ }
+ }
+
+ ASSERT(error == ENOENT);
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ zcb->zcb_spa = spa;
+
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ VERIFY(space_map_load(&msp->ms_map,
+ &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+ spa->spa_meta_objset) == 0);
+ msp->ms_map.sm_ppd = vd;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ zdb_ddt_leak_init(spa, zcb);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static void
+zdb_leak_fini(spa_t *spa)
+{
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+ }
+}
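
The relocated leak check works by inversion: zdb_leak_init() loads every metaslab space map with SM_ALLOC, the traversal's claims carve each visited block back out of that in-core view, and zdb_leak_fini()'s space_map_vacate() funnels whatever survives through zdb_leak(). Allocated but never referenced means leaked. A libc-only miniature of the idea:

    #include <stdio.h>

    int
    main(void)
    {
        /* 1 = allocated per the space map; 1 = visited by traversal */
        int allocated[8] = { 1, 0, 1, 1, 0, 1, 0, 0 };
        int reached[8]   = { 1, 0, 1, 0, 0, 1, 0, 0 };

        for (int i = 0; i < 8; i++)
            if (allocated[i] && !reached[i])
                (void) printf("leaked space: segment %d\n", i);
        return (0);
    }
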
+
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zdb_cb_t *zcb = arg;
+
+ if (dump_opt['b'] >= 4) {
+ char blkbuf[BP_SPRINTF_LEN];
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("[%s] %s\n",
+ "deferred free", blkbuf);
+ }
+ zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+ return (0);
+}
+
static int
dump_block_stats(spa_t *spa)
{
zdb_cb_t zcb = { 0 };
zdb_blkstats_t *zb, *tzb;
- uint64_t alloc, space, logalloc;
- vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t norm_alloc, norm_space, total_alloc, total_found;
+ int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
int leaks = 0;
- int c, e;
- if (!dump_opt['S']) {
- (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
- (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
- (dump_opt['c'] == 1) ? "metadata " : "",
- dump_opt['c'] ? "checksums " : "",
- (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
- !dump_opt['L'] ? "nothing leaked " : "");
- }
+ (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+ (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+ (dump_opt['c'] == 1) ? "metadata " : "",
+ dump_opt['c'] ? "checksums " : "",
+ (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+ !dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
@@ -1654,39 +2183,25 @@ dump_block_stats(spa_t *spa)
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
*/
- if (!dump_opt['L'])
- zdb_leak_init(spa);
+ zdb_leak_init(spa, &zcb);
/*
* If there's a deferred-free bplist, process that first.
*/
- if (spa->spa_sync_bplist_obj != 0) {
- bplist_t *bpl = &spa->spa_sync_bplist;
- blkptr_t blk;
- uint64_t itor = 0;
+ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+ count_block_cb, &zcb, NULL);
+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+ count_block_cb, &zcb, NULL);
- VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
- spa->spa_sync_bplist_obj));
+ if (dump_opt['c'] > 1)
+ flags |= TRAVERSE_PREFETCH_DATA;
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- if (dump_opt['b'] >= 4) {
- char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
- (void) printf("[%s] %s\n",
- "deferred free", blkbuf);
- }
- zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
- }
-
- bplist_close(bpl);
- }
-
- zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
+ zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
- if (zcb.zcb_haderrors && !dump_opt['S']) {
+ if (zcb.zcb_haderrors) {
(void) printf("\nError counts:\n\n");
(void) printf("\t%5s %s\n", "errno", "count");
- for (e = 0; e < 256; e++) {
+ for (int e = 0; e < 256; e++) {
if (zcb.zcb_errors[e] != 0) {
(void) printf("\t%5d %llu\n",
e, (u_longlong_t)zcb.zcb_errors[e]);
@@ -1697,43 +2212,27 @@ dump_block_stats(spa_t *spa)
/*
* Report any leaked segments.
*/
- if (!dump_opt['L'])
- zdb_leak_fini(spa);
+ zdb_leak_fini(spa);
- /*
- * If we're interested in printing out the blkptr signatures,
- * return now as we don't print out anything else (including
- * errors and leaks).
- */
- if (dump_opt['S'])
- return (zcb.zcb_haderrors ? 3 : 0);
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
-
- /*
- * Log blocks allocated from a separate log device don't count
- * as part of the normal pool space; factor them in here.
- */
- logalloc = 0;
+ tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
- for (c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_islog)
- logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc;
+ norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ norm_space = metaslab_class_get_space(spa_normal_class(spa));
- tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
+ total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+ total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
- if (tzb->zb_asize == alloc + logalloc) {
+ if (total_found == total_alloc) {
if (!dump_opt['L'])
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
- (u_longlong_t)tzb->zb_asize,
- (u_longlong_t)alloc + logalloc,
+ (u_longlong_t)total_found,
+ (u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
- (longlong_t)(alloc + logalloc - tzb->zb_asize));
+ (longlong_t)(total_alloc - total_found));
leaks = 1;
}
@@ -1743,33 +2242,41 @@ dump_block_stats(spa_t *spa)
(void) printf("\n");
(void) printf("\tbp count: %10llu\n",
(u_longlong_t)tzb->zb_count);
- (void) printf("\tbp logical: %10llu\t avg: %6llu\n",
+ (void) printf("\tbp logical: %10llu avg: %6llu\n",
(u_longlong_t)tzb->zb_lsize,
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
- (void) printf("\tbp physical: %10llu\t avg:"
- " %6llu\tcompression: %6.2f\n",
+ (void) printf("\tbp physical: %10llu avg:"
+ " %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_psize,
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_psize);
- (void) printf("\tbp allocated: %10llu\t avg:"
- " %6llu\tcompression: %6.2f\n",
+ (void) printf("\tbp allocated: %10llu avg:"
+ " %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_asize,
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_asize);
- (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
- (u_longlong_t)alloc, 100.0 * alloc / space);
+ (void) printf("\tbp deduped: %10llu ref>1:"
+ " %6llu deduplication: %6.2f\n",
+ (u_longlong_t)zcb.zcb_dedup_asize,
+ (u_longlong_t)zcb.zcb_dedup_blocks,
+ (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+ (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
+ (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
if (dump_opt['b'] >= 2) {
int l, t, level;
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
"\t avg\t comp\t%%Total\tType\n");
- for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
- char csize[6], lsize[6], psize[6], asize[6], avg[6];
+ for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+ char csize[32], lsize[32], psize[32], asize[32];
+ char avg[32];
char *typename;
- typename = t == DMU_OT_DEFERRED ? "deferred free" :
- t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+ if (t < DMU_OT_NUMTYPES)
+ typename = dmu_ot[t].ot_name;
+ else
+ typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
(void) printf("%6s\t%5s\t%5s\t%5s"
@@ -1799,11 +2306,11 @@ dump_block_stats(spa_t *spa)
zcb.zcb_type[ZB_TOTAL][t].zb_asize)
continue;
- nicenum(zb->zb_count, csize);
- nicenum(zb->zb_lsize, lsize);
- nicenum(zb->zb_psize, psize);
- nicenum(zb->zb_asize, asize);
- nicenum(zb->zb_asize / zb->zb_count, avg);
+ zdb_nicenum(zb->zb_count, csize);
+ zdb_nicenum(zb->zb_lsize, lsize);
+ zdb_nicenum(zb->zb_psize, psize);
+ zdb_nicenum(zb->zb_asize, asize);
+ zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
"\t%5.2f\t%6.2f\t",
@@ -1831,36 +2338,157 @@ dump_block_stats(spa_t *spa)
return (0);
}
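
Two totals above deserve a gloss. Traversal counts a deduplicated block once per referencing blkptr, so tzb->zb_asize over-counts physical space; zcb_dedup_asize accumulated asize * (refcnt - 1) per DDT entry, and subtracting it yields total_found, which must match the allocator's view. The printed deduplication line is then plain arithmetic; with invented numbers:

    #include <stdio.h>

    int
    main(void)
    {
        double zb_asize = 100.0;      /* GB referenced by blkptrs */
        double dedup_asize = 20.0;    /* GB of references beyond the first */

        /* prints "  1.20" */
        (void) printf("%6.2f\n", dedup_asize / zb_asize + 1.0);
        return (0);
    }
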
+typedef struct zdb_ddt_entry {
+ ddt_key_t zdde_key;
+ uint64_t zdde_ref_blocks;
+ uint64_t zdde_ref_lsize;
+ uint64_t zdde_ref_psize;
+ uint64_t zdde_ref_dsize;
+ avl_node_t zdde_node;
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ avl_tree_t *t = arg;
+ avl_index_t where;
+ zdb_ddt_entry_t *zdde, zdde_search;
+
+ if (bp == NULL)
+ return (0);
+
+ if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+ (void) printf("traversing objset %llu, %llu objects, "
+ "%lu blocks so far\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)bp->blk_fill,
+ avl_numnodes(t));
+ }
+
+ if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+ BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+ return (0);
+
+ ddt_key_fill(&zdde_search.zdde_key, bp);
+
+ zdde = avl_find(t, &zdde_search, &where);
+
+ if (zdde == NULL) {
+ zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+ zdde->zdde_key = zdde_search.zdde_key;
+ avl_insert(t, zdde, where);
+ }
+
+ zdde->zdde_ref_blocks += 1;
+ zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+ zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+ zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+ return (0);
+}
+
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+ avl_tree_t t;
+ void *cookie = NULL;
+ zdb_ddt_entry_t *zdde;
+ ddt_histogram_t ddh_total = { 0 };
+ ddt_stat_t dds_total = { 0 };
+
+ avl_create(&t, ddt_entry_compare,
+ sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ zdb_ddt_add_cb, &t);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+ ddt_stat_t dds;
+ uint64_t refcnt = zdde->zdde_ref_blocks;
+ ASSERT(refcnt != 0);
+
+ dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+ dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+ dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+ dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+ dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+ dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+ dds.dds_ref_psize = zdde->zdde_ref_psize;
+ dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+ ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0);
+
+ umem_free(zdde, sizeof (*zdde));
+ }
+
+ avl_destroy(&t);
+
+ ddt_histogram_stat(&dds_total, &ddh_total);
+
+ (void) printf("Simulated DDT histogram:\n");
+
+ zpool_dump_ddt(&dds_total, &ddh_total);
+
+ dump_dedup_ratio(&dds_total);
+}
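
dump_simulated_ddt() answers what dedup would save on a pool that never enabled it: every level-0 data block is keyed the way the DDT would key it, per-key reference totals accumulate in an AVL tree, and entries are binned by the power of two of their refcount. A libc-only stand-in for highbit() (assumed to return the 1-based position of the highest set bit, as the kernel helper does) shows the binning:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in: 1-based index of the highest set bit. */
    static int
    highbit_sketch(uint64_t v)
    {
        int h = 0;

        while (v != 0) {
            h++;
            v >>= 1;
        }
        return (h);
    }

    int
    main(void)
    {
        uint64_t refcnt[] = { 1, 2, 3, 4, 7, 8 };

        /* refcnt 1 -> ddh_stat[0], 2-3 -> [1], 4-7 -> [2], 8 -> [3] */
        for (int i = 0; i < 6; i++)
            (void) printf("refcnt %llu -> ddh_stat[%d]\n",
                (unsigned long long)refcnt[i],
                highbit_sketch(refcnt[i]) - 1);
        return (0);
    }
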
+
static void
dump_zpool(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
int rc = 0;
+ if (dump_opt['S']) {
+ dump_simulated_ddt(spa);
+ return;
+ }
+
+ if (!dump_opt['e'] && dump_opt['C'] > 1) {
+ (void) printf("\nCached configuration:\n");
+ dump_nvlist(spa->spa_config, 8);
+ }
+
+ if (dump_opt['C'])
+ dump_config(spa);
+
if (dump_opt['u'])
- dump_uberblock(&spa->spa_uberblock);
+ dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
- if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
+ if (dump_opt['D'])
+ dump_all_ddts(spa);
+
+ if (dump_opt['d'] > 2 || dump_opt['m'])
+ dump_metaslabs(spa);
+
+ if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
- dump_bplist(dp->dp_meta_objset,
- spa->spa_sync_bplist_obj, "Deferred frees");
+ dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+ "Pool frees");
+ }
dump_dtl(spa->spa_root_vdev, 0);
}
-
- if (dump_opt['d'] >= 3 || dump_opt['m'])
- dump_metaslabs(spa);
-
- (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
- DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+ (void) dmu_objset_find(spa_name(spa), dump_one_dir,
+ NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
}
-
- if (dump_opt['b'] || dump_opt['c'] || dump_opt['S'])
+ if (dump_opt['b'] || dump_opt['c'])
rc = dump_block_stats(spa);
if (dump_opt['s'])
show_pool_stats(spa);
+ if (dump_opt['h'])
+ dump_history(spa);
+
if (rc != 0)
exit(rc);
}
@@ -1879,51 +2507,13 @@ int flagbits[256];
static void
zdb_print_blkptr(blkptr_t *bp, int flags)
{
- dva_t *dva = bp->blk_dva;
- int d;
+ char blkbuf[BP_SPRINTF_LEN];
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
- /*
- * Super-ick warning: This code is also duplicated in
- * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code
- * replication, too.
- */
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
- (longlong_t)DVA_GET_VDEV(&dva[d]),
- (longlong_t)DVA_GET_OFFSET(&dva[d]));
- (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t"
- "ASIZE: %llx\n", d,
- DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
- (longlong_t)DVA_GET_GRID(&dva[d]),
- (longlong_t)DVA_GET_ASIZE(&dva[d]));
- (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
- (u_longlong_t)DVA_GET_VDEV(&dva[d]),
- (longlong_t)DVA_GET_OFFSET(&dva[d]),
- (longlong_t)BP_GET_PSIZE(bp),
- BP_SHOULD_BYTESWAP(bp) ? "e" : "",
- !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
- "d" : "",
- DVA_GET_GANG(&dva[d]) ? "g" : "",
- BP_GET_COMPRESS(bp) != 0 ? "d" : "");
- }
- (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n",
- (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp));
- (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
- BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
- dmu_ot[BP_GET_TYPE(bp)].ot_name);
- (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n",
- (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp),
- (u_longlong_t)bp->blk_fill);
- (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n",
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
- (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n",
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("%s\n", blkbuf);
}
static void
@@ -1946,7 +2536,7 @@ zdb_dump_block_raw(void *buf, uint64_t size, int flags)
{
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array(buf, size);
- (void) write(2, buf, size);
+ (void) write(1, buf, size);
}
static void
@@ -2049,31 +2639,30 @@ name:
* flags - A string of characters specifying options
* b: Decode a blkptr at given offset within block
* *c: Calculate and display checksums
- * *d: Decompress data before dumping
+ * d: Decompress data before dumping
* e: Byteswap data before dumping
- * *g: Display data as a gang block header
- * *i: Display as an indirect block
+ * g: Display data as a gang block header
+ * i: Display as an indirect block
* p: Do I/O to physical offset
* r: Dump raw data to stdout
*
* * = not yet implemented
*/
static void
-zdb_read_block(char *thing, spa_t **spap)
+zdb_read_block(char *thing, spa_t *spa)
{
- spa_t *spa = *spap;
+ blkptr_t blk, *bp = &blk;
+ dva_t *dva = bp->blk_dva;
int flags = 0;
- uint64_t offset = 0, size = 0, blkptr_offset = 0;
+ uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
- void *buf;
- char *s, *p, *dup, *pool, *vdev, *flagstr;
- int i, error, zio_flags;
+ void *pbuf, *lbuf, *buf;
+ char *s, *p, *dup, *vdev, *flagstr;
+ int i, error;
dup = strdup(thing);
s = strtok(dup, ":");
- pool = s ? s : "";
- s = strtok(NULL, ":");
vdev = s ? s : "";
s = strtok(NULL, ":");
offset = strtoull(s ? s : "", NULL, 16);
@@ -2107,7 +2696,7 @@ zdb_read_block(char *thing, spa_t **spap)
flags |= bit;
/* If it's not something with an argument, keep going */
- if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+ if ((bit & (ZDB_FLAG_CHECKSUM |
ZDB_FLAG_PRINT_BLKPTR)) == 0)
continue;
@@ -2122,16 +2711,6 @@ zdb_read_block(char *thing, spa_t **spap)
}
}
- if (spa == NULL || strcmp(spa_name(spa), pool) != 0) {
- if (spa)
- spa_close(spa, (void *)zdb_read_block);
- error = spa_open(pool, spap, (void *)zdb_read_block);
- if (error)
- fatal("Failed to open pool '%s': %s",
- pool, strerror(error));
- spa = *spap;
- }
-
vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
if (vd == NULL) {
(void) printf("***Invalid vdev: %s\n", vdev);
@@ -2139,22 +2718,58 @@ zdb_read_block(char *thing, spa_t **spap)
return;
} else {
if (vd->vdev_path)
- (void) printf("Found vdev: %s\n", vd->vdev_path);
+ (void) fprintf(stderr, "Found vdev: %s\n",
+ vd->vdev_path);
else
- (void) printf("Found vdev type: %s\n",
+ (void) fprintf(stderr, "Found vdev type: %s\n",
vd->vdev_ops->vdev_op_type);
}
- buf = umem_alloc(size, UMEM_NOFAIL);
+ psize = size;
+ lsize = size;
+
+ pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[0], offset);
+ DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
zio = zio_root(spa, NULL, NULL, 0);
- /* XXX todo - cons up a BP so RAID-Z will be happy */
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
- ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+
+ if (vd == vd->vdev_top) {
+ /*
+ * Treat this as a normal block read.
+ */
+ zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+ } else {
+ /*
+ * Treat this as a vdev child I/O.
+ */
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+ ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+ }
+
error = zio_wait(zio);
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -2163,6 +2778,52 @@ zdb_read_block(char *thing, spa_t **spap)
goto out;
}
+ if (flags & ZDB_FLAG_DECOMPRESS) {
+ /*
+ * We don't know how the data was compressed, so just try
+ * every decompress function at every inflated blocksize.
+ */
+ enum zio_compress c;
+ void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+ bcopy(pbuf, pbuf2, psize);
+
+ VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+ SPA_MAXBLOCKSIZE - psize) == 0);
+
+ VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+ SPA_MAXBLOCKSIZE - psize) == 0);
+
+ for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+ lsize -= SPA_MINBLOCKSIZE) {
+ for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+ if (zio_decompress_data(c, pbuf, lbuf,
+ psize, lsize) == 0 &&
+ zio_decompress_data(c, pbuf2, lbuf2,
+ psize, lsize) == 0 &&
+ bcmp(lbuf, lbuf2, lsize) == 0)
+ break;
+ }
+ if (c != ZIO_COMPRESS_FUNCTIONS)
+ break;
+ }
+
+ umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+ umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+ if (lsize <= psize) {
+ (void) printf("Decompress of %s failed\n", thing);
+ goto out;
+ }
+ buf = lbuf;
+ size = lsize;
+ } else {
+ buf = pbuf;
+ size = psize;
+ }
+
if (flags & ZDB_FLAG_PRINT_BLKPTR)
zdb_print_blkptr((blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
@@ -2177,134 +2838,92 @@ zdb_read_block(char *thing, spa_t **spap)
zdb_dump_block(thing, buf, size, flags);
out:
- umem_free(buf, size);
+ umem_free(pbuf, SPA_MAXBLOCKSIZE);
+ umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
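
The decompress path brute-forces both the algorithm and the logical size, and the two-buffer dance is what makes it trustworthy: the padding beyond psize is filled with different random bytes in pbuf and pbuf2, so any candidate whose apparent success actually depended on data past the physical block produces mismatched outputs and is rejected. A toy demonstration of why one buffer is not enough:

    #include <stdio.h>
    #include <string.h>

    /* Toy "decompressor" that wrongly depends on a byte past psize. */
    static int
    toy_decompress(const unsigned char *src, size_t psize,
        unsigned char *dst, size_t lsize)
    {
        (void) memset(dst, src[psize], lsize);
        return (0);    /* claims success either way */
    }

    int
    main(void)
    {
        unsigned char p1[8] = { 1, 2, 3, 4, 0xAA };    /* padding 0xAA */
        unsigned char p2[8] = { 1, 2, 3, 4, 0x55 };    /* padding 0x55 */
        unsigned char l1[16], l2[16];

        if (toy_decompress(p1, 4, l1, 16) == 0 &&
            toy_decompress(p2, 4, l2, 16) == 0 &&
            memcmp(l1, l2, 16) == 0)
            (void) printf("accepted\n");
        else
            (void) printf("rejected: output depended on padding\n");
        return (0);
    }
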
static boolean_t
-nvlist_string_match(nvlist_t *config, char *name, char *tgt)
+pool_match(nvlist_t *cfg, char *tgt)
{
+ uint64_t v, guid = strtoull(tgt, NULL, 0);
char *s;
- if (nvlist_lookup_string(config, name, &s) != 0)
- return (B_FALSE);
-
- return (strcmp(s, tgt) == 0);
-}
-
-static boolean_t
-nvlist_uint64_match(nvlist_t *config, char *name, uint64_t tgt)
-{
- uint64_t val;
-
- if (nvlist_lookup_uint64(config, name, &val) != 0)
- return (B_FALSE);
-
- return (val == tgt);
-}
-
-static boolean_t
-vdev_child_guid_match(nvlist_t *vdev, uint64_t guid)
-{
- nvlist_t **child;
- uint_t c, children;
-
- verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
- for (c = 0; c < children; ++c)
- if (nvlist_uint64_match(child[c], ZPOOL_CONFIG_GUID, guid))
- return (B_TRUE);
- return (B_FALSE);
-}
-
-static boolean_t
-vdev_child_string_match(nvlist_t *vdev, char *tgt)
-{
- nvlist_t **child;
- uint_t c, children;
-
- verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
- for (c = 0; c < children; ++c) {
- if (nvlist_string_match(child[c], ZPOOL_CONFIG_PATH, tgt) ||
- nvlist_string_match(child[c], ZPOOL_CONFIG_DEVID, tgt))
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-static boolean_t
-vdev_guid_match(nvlist_t *config, uint64_t guid)
-{
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- return (nvlist_uint64_match(nvroot, ZPOOL_CONFIG_GUID, guid) ||
- vdev_child_guid_match(nvroot, guid));
-}
-
-static boolean_t
-vdev_string_match(nvlist_t *config, char *tgt)
-{
- nvlist_t *nvroot;
-
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
-
- return (vdev_child_string_match(nvroot, tgt));
-}
-
-static boolean_t
-pool_match(nvlist_t *config, char *tgt)
-{
- uint64_t guid = strtoull(tgt, NULL, 0);
-
if (guid != 0) {
- return (
- nvlist_uint64_match(config, ZPOOL_CONFIG_POOL_GUID, guid) ||
- vdev_guid_match(config, guid));
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
+ return (v == guid);
} else {
- return (
- nvlist_string_match(config, ZPOOL_CONFIG_POOL_NAME, tgt) ||
- vdev_string_match(config, tgt));
+ if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
+ return (strcmp(s, tgt) == 0);
}
+ return (B_FALSE);
}
-static int
-find_exported_zpool(char *pool_id, nvlist_t **configp, char *vdev_dir)
+static char *
+find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
{
nvlist_t *pools;
- int error = ENOENT;
nvlist_t *match = NULL;
+ char *name = NULL;
+ char *sepp = NULL;
+ char sep;
+ int count = 0;
+ importargs_t args = { 0 };
- if (vdev_dir != NULL)
- pools = zpool_find_import_activeok(g_zfs, 1, &vdev_dir);
- else
- pools = zpool_find_import_activeok(g_zfs, 0, NULL);
+ args.paths = dirc;
+ args.path = dirv;
+ args.can_be_active = B_TRUE;
+
+ if ((sepp = strpbrk(*target, "/@")) != NULL) {
+ sep = *sepp;
+ *sepp = '\0';
+ }
+
+ pools = zpool_search_import(g_zfs, &args);
if (pools != NULL) {
nvpair_t *elem = NULL;
-
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
verify(nvpair_value_nvlist(elem, configp) == 0);
- if (pool_match(*configp, pool_id)) {
+ if (pool_match(*configp, *target)) {
+ count++;
if (match != NULL) {
- (void) fatal(
- "More than one matching pool - "
- "specify guid/devid/device path.");
+ /* print previously found config */
+ if (name != NULL) {
+ (void) printf("%s\n", name);
+ dump_nvlist(match, 8);
+ name = NULL;
+ }
+ (void) printf("%s\n",
+ nvpair_name(elem));
+ dump_nvlist(*configp, 8);
} else {
match = *configp;
- error = 0;
+ name = nvpair_name(elem);
}
}
}
}
+ if (count > 1)
+ (void) fatal("\tMatched %d pools - use pool GUID "
+ "instead of pool name or \n"
+ "\tpool name part of a dataset name to select pool", count);
- *configp = error ? NULL : match;
+ if (sepp)
+ *sepp = sep;
+ /*
+ * If pool GUID was specified for pool id, replace it with pool name
+ */
+ if (name && (strstr(*target, name) != *target)) {
+ int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
+
+ *target = umem_alloc(sz, UMEM_NOFAIL);
+ (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
+ }
- return (error);
+ *configp = name ? match : NULL;
+
+ return (name);
}
int
@@ -2312,83 +2931,85 @@ main(int argc, char **argv)
{
int i, c;
struct rlimit rl = { 1024, 1024 };
- spa_t *spa;
+ spa_t *spa = NULL;
objset_t *os = NULL;
- char *endstr;
int dump_all = 1;
int verbose = 0;
- int error;
- int exported = 0;
- char *vdev_dir = NULL;
+ int error = 0;
+ char **searchdirs = NULL;
+ int nsearch = 0;
+ char *target;
+ nvlist_t *policy = NULL;
+ uint64_t max_txg = UINT64_MAX;
+ int rewind = ZPOOL_NEVER_REWIND;
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
+ while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
switch (c) {
- case 'u':
- case 'd':
- case 'i':
case 'b':
case 'c':
+ case 'd':
+ case 'h':
+ case 'i':
+ case 'l':
case 'm':
case 's':
+ case 'u':
case 'C':
- case 'l':
+ case 'D':
case 'R':
+ case 'S':
dump_opt[c]++;
dump_all = 0;
break;
+ case 'A':
+ case 'F':
case 'L':
+ case 'X':
+ case 'e':
+ case 'P':
dump_opt[c]++;
break;
case 'v':
verbose++;
break;
- case 'U':
- spa_config_path = optarg;
- break;
- case 'e':
- exported = 1;
- break;
case 'p':
- vdev_dir = optarg;
- break;
- case 'S':
- dump_opt[c]++;
- dump_all = 0;
- zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0);
- if (!zdb_sig_user_data && strncmp(optarg, "all:", 4))
- usage();
- endstr = strchr(optarg, ':') + 1;
- if (strcmp(endstr, "fletcher2") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
- else if (strcmp(endstr, "fletcher4") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4;
- else if (strcmp(endstr, "sha256") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
- else if (strcmp(endstr, "all") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
- else
- usage();
+ if (searchdirs == NULL) {
+ searchdirs = umem_alloc(sizeof (char *),
+ UMEM_NOFAIL);
+ } else {
+ char **tmp = umem_alloc((nsearch + 1) *
+ sizeof (char *), UMEM_NOFAIL);
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ umem_free(searchdirs,
+ nsearch * sizeof (char *));
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = optarg;
break;
case 't':
- ub_max_txg = strtoull(optarg, NULL, 0);
- if (ub_max_txg < TXG_INITIAL) {
+ max_txg = strtoull(optarg, NULL, 0);
+ if (max_txg < TXG_INITIAL) {
(void) fprintf(stderr, "incorrect txg "
"specified: %s\n", optarg);
usage();
}
break;
+ case 'U':
+ spa_config_path = optarg;
+ break;
default:
usage();
break;
}
}
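
The -p handling grows searchdirs by one element per occurrence with an alloc/bcopy/free cycle, quadratic in the number of flags but harmless at this scale. For comparison, a libc-only sketch of the same append using realloc (not the committed code, which uses the umem allocator):

    #include <stdlib.h>

    static char **
    append_dir(char **dirs, int *nsearch, char *dir)
    {
        char **tmp = realloc(dirs, (*nsearch + 1) * sizeof (char *));

        if (tmp == NULL)
            abort();    /* mirror UMEM_NOFAIL: never returns NULL */
        tmp[(*nsearch)++] = dir;
        return (tmp);
    }

    int
    main(void)
    {
        char **dirs = NULL;
        int nsearch = 0;

        dirs = append_dir(dirs, &nsearch, "dir1");
        dirs = append_dir(dirs, &nsearch, "dir2");
        free(dirs);
        return (0);
    }
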
- if (vdev_dir != NULL && exported == 0) {
+ if (!dump_opt['e'] && searchdirs != NULL) {
(void) fprintf(stderr, "-p option requires use of -e\n");
usage();
}
@@ -2397,18 +3018,26 @@ main(int argc, char **argv)
g_zfs = libzfs_init();
ASSERT(g_zfs != NULL);
+ if (dump_all)
+ verbose = MAX(verbose, 1);
+
for (c = 0; c < 256; c++) {
- if (dump_all && c != 'l' && c != 'R')
+ if (dump_all && !strchr("elAFLRSXP", c))
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
}
+ aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+ zfs_recover = (dump_opt['A'] > 1);
+
argc -= optind;
argv += optind;
+ if (argc < 2 && dump_opt['R'])
+ usage();
if (argc < 1) {
- if (dump_opt['C']) {
+ if (!dump_opt['e'] && dump_opt['C']) {
dump_cachefile(spa_config_path);
return (0);
}
@@ -2420,99 +3049,107 @@ main(int argc, char **argv)
return (0);
}
- if (dump_opt['R']) {
- flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
- flagbits['c'] = ZDB_FLAG_CHECKSUM;
- flagbits['d'] = ZDB_FLAG_DECOMPRESS;
- flagbits['e'] = ZDB_FLAG_BSWAP;
- flagbits['g'] = ZDB_FLAG_GBH;
- flagbits['i'] = ZDB_FLAG_INDIRECT;
- flagbits['p'] = ZDB_FLAG_PHYS;
- flagbits['r'] = ZDB_FLAG_RAW;
-
- spa = NULL;
- while (argv[0]) {
- zdb_read_block(argv[0], &spa);
- argv++;
- argc--;
- }
- if (spa)
- spa_close(spa, (void *)zdb_read_block);
- return (0);
- }
+ if (dump_opt['X'] || dump_opt['F'])
+ rewind = ZPOOL_DO_REWIND |
+ (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
- if (dump_opt['C'])
- dump_config(argv[0]);
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+ nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
+ fatal("internal error: %s", strerror(ENOMEM));
error = 0;
- if (exported) {
- /*
- * Check to see if the name refers to an exported zpool
- */
- char *slash;
- nvlist_t *exported_conf = NULL;
-
- if ((slash = strchr(argv[0], '/')) != NULL)
- *slash = '\0';
-
- error = find_exported_zpool(argv[0], &exported_conf, vdev_dir);
- if (error == 0) {
- nvlist_t *nvl = NULL;
-
- if (vdev_dir != NULL) {
- if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
- error = ENOMEM;
- else if (nvlist_add_string(nvl,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT),
- vdev_dir) != 0)
- error = ENOMEM;
- }
+ target = argv[0];
- if (error == 0)
- error = spa_import_verbatim(argv[0],
- exported_conf, nvl);
+ if (dump_opt['e']) {
+ nvlist_t *cfg = NULL;
+ char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
- nvlist_free(nvl);
+ error = ENOENT;
+ if (name) {
+ if (dump_opt['C'] > 1) {
+ (void) printf("\nConfiguration for import:\n");
+ dump_nvlist(cfg, 8);
+ }
+ if (nvlist_add_nvlist(cfg,
+ ZPOOL_REWIND_POLICY, policy) != 0) {
+ fatal("can't open '%s': %s",
+ target, strerror(ENOMEM));
+ }
+ if ((error = spa_import(name, cfg, NULL,
+ ZFS_IMPORT_MISSING_LOG)) != 0) {
+ error = spa_import(name, cfg, NULL,
+ ZFS_IMPORT_VERBATIM);
+ }
}
-
- if (slash != NULL)
- *slash = '/';
}
if (error == 0) {
- if (strchr(argv[0], '/') != NULL) {
- error = dmu_objset_open(argv[0], DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os);
+ if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
+ error = spa_open_rewind(target, &spa, FTAG, policy,
+ NULL);
+ if (error) {
+ /*
+ * If we're missing the log device then
+ * try opening the pool after clearing the
+ * log state.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(target)) != NULL &&
+ spa->spa_log_state == SPA_LOG_MISSING) {
+ spa->spa_log_state = SPA_LOG_CLEAR;
+ error = 0;
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ if (!error) {
+ error = spa_open_rewind(target, &spa,
+ FTAG, policy, NULL);
+ }
+ }
} else {
- error = spa_open(argv[0], &spa, FTAG);
+ error = dmu_objset_own(target, DMU_OST_ANY,
+ B_TRUE, FTAG, &os);
}
}
+ nvlist_free(policy);
if (error)
- fatal("can't open %s: %s", argv[0], strerror(error));
+ fatal("can't open '%s': %s", target, strerror(error));
argv++;
- if (--argc > 0) {
- zopt_objects = argc;
- zopt_object = calloc(zopt_objects, sizeof (uint64_t));
- for (i = 0; i < zopt_objects; i++) {
- errno = 0;
- zopt_object[i] = strtoull(argv[i], NULL, 0);
- if (zopt_object[i] == 0 && errno != 0)
- fatal("bad object number %s: %s",
- argv[i], strerror(errno));
+ argc--;
+ if (!dump_opt['R']) {
+ if (argc > 0) {
+ zopt_objects = argc;
+ zopt_object = calloc(zopt_objects, sizeof (uint64_t));
+ for (i = 0; i < zopt_objects; i++) {
+ errno = 0;
+ zopt_object[i] = strtoull(argv[i], NULL, 0);
+ if (zopt_object[i] == 0 && errno != 0)
+ fatal("bad number %s: %s",
+ argv[i], strerror(errno));
+ }
}
- }
-
- if (os != NULL) {
- dump_dir(os);
- dmu_objset_close(os);
+ (os != NULL) ? dump_dir(os) : dump_zpool(spa);
} else {
- dump_zpool(spa);
- spa_close(spa, FTAG);
+ flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+ flagbits['c'] = ZDB_FLAG_CHECKSUM;
+ flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+ flagbits['e'] = ZDB_FLAG_BSWAP;
+ flagbits['g'] = ZDB_FLAG_GBH;
+ flagbits['i'] = ZDB_FLAG_INDIRECT;
+ flagbits['p'] = ZDB_FLAG_PHYS;
+ flagbits['r'] = ZDB_FLAG_RAW;
+
+ for (i = 0; i < argc; i++)
+ zdb_read_block(argv[i], spa);
}
+ (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
+
fuid_table_destroy();
+ sa_loaded = B_FALSE;
libzfs_fini(g_zfs);
kernel_fini();
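
The import path above settles into one pattern: build a rewind policy
nvlist, hand it to the open, and retry after clearing a missing-log state.
A condensed sketch of that control flow, using only calls visible in the
hunk (fatal()-style error handling trimmed):

	nvlist_t *policy;

	VERIFY(nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) == 0);
	VERIFY(nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG,
	    max_txg) == 0);
	VERIFY(nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) == 0);

	error = spa_open_rewind(target, &spa, FTAG, policy, NULL);
	if (error) {
		/*
		 * A missing slog need not be fatal for inspection: clear
		 * the log state under the namespace lock, then retry the
		 * open without the lock held.
		 */
		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(target)) != NULL &&
		    spa->spa_log_state == SPA_LOG_MISSING) {
			spa->spa_log_state = SPA_LOG_CLEAR;
			error = 0;
		}
		mutex_exit(&spa_namespace_lock);
		if (!error)
			error = spa_open_rewind(target, &spa, FTAG,
			    policy, NULL);
	}
	nvlist_free(policy);
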
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
index 1b3c18fab1c2..a0ed985f52b7 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -40,12 +40,14 @@
extern uint8_t dump_opt[256];
+static char prefix[4] = "\t\t\t";
+
static void
print_log_bp(const blkptr_t *bp, const char *prefix)
{
char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+ sprintf_blkptr(blkbuf, bp);
(void) printf("%s%s\n", prefix, blkbuf);
}
@@ -54,19 +56,29 @@ static void
zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
{
time_t crtime = lr->lr_crtime[0];
- char *name = (char *)(lr + 1);
- char *link = name + strlen(name) + 1;
+ char *name, *link;
+ lr_attr_t *lrattr;
- if (txtype == TX_SYMLINK)
- (void) printf("\t\t\t%s -> %s\n", name, link);
- else
- (void) printf("\t\t\t%s\n", name);
+ name = (char *)(lr + 1);
+
+ if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+ lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+ lrattr = (lr_attr_t *)(lr + 1);
+ name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ if (txtype == TX_SYMLINK) {
+ link = name + strlen(name) + 1;
+ (void) printf("%s%s -> %s\n", prefix, name, link);
+ } else if (txtype != TX_MKXATTR) {
+ (void) printf("%s%s\n", prefix, name);
+ }
- (void) printf("\t\t\t%s", ctime(&crtime));
- (void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+ (void) printf("%s%s", prefix, ctime(&crtime));
+ (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
(longlong_t)lr->lr_mode);
- (void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+ (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
(u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
(u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
}
@@ -75,7 +87,7 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
static void
zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
{
- (void) printf("\t\t\tdoid %llu, name %s\n",
+ (void) printf("%sdoid %llu, name %s\n", prefix,
(u_longlong_t)lr->lr_doid, (char *)(lr + 1));
}
@@ -83,7 +95,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
static void
zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
{
- (void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+ (void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
(char *)(lr + 1));
}
@@ -95,9 +107,9 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
char *snm = (char *)(lr + 1);
char *tnm = snm + strlen(snm) + 1;
- (void) printf("\t\t\tsdoid %llu, tdoid %llu\n",
+ (void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
(u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
- (void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm);
+ (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
}
/* ARGSUSED */
@@ -106,44 +118,48 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
char *data, *dlimit;
blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
- (void) printf("\t\t\tfoid %llu, offset 0x%llx,"
- " length 0x%llx, blkoff 0x%llx\n",
- (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
- (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+ (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
if (txtype == TX_WRITE2 || verbose < 5)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- (void) printf("\t\t\thas blkptr, %s\n",
+ (void) printf("%shas blkptr, %s\n", prefix,
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
- print_log_bp(bp, "\t\t\t");
+ print_log_bp(bp, prefix);
+
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
}
if (bp->blk_birth == 0) {
bzero(buf, sizeof (buf));
- } else {
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lr->lr_foid;
- zb.zb_level = 0;
- zb.zb_blkid = -1; /* unknown */
-
- error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
- if (error)
- return;
+ (void) printf("%s<hole>\n", prefix);
+ return;
+ }
+ if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+ (void) printf("%s<block already committed>\n", prefix);
+ return;
}
- data = buf + lr->lr_blkoff;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = zio_wait(zio_read(NULL, zilog->zl_spa,
+ bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+ if (error)
+ return;
+ data = buf;
} else {
data = (char *)(lr + 1);
}
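
The rewritten read path derives the bookmark from the record itself
instead of the old lr_blkoff field. A sketch of just that lookup, on the
assumption that lr_offset is always a multiple of the block's logical
size for a ZIL write record:

	zbookmark_t zb;

	SET_BOOKMARK(&zb,
	    dmu_objset_id(zilog->zl_os),	/* objset owning the log */
	    lr->lr_foid,			/* object the write targets */
	    ZB_ZIL_LEVEL,			/* marks a ZIL block */
	    lr->lr_offset / BP_GET_LSIZE(bp));	/* block index in the file */

	/* ZIO_FLAG_CANFAIL: a damaged pool degrades the dump, not aborts. */
	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf,
	    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL, &zb));
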
@@ -151,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
dlimit = data + MIN(lr->lr_length,
(verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
- (void) printf("\t\t\t");
+ (void) printf("%s", prefix);
while (data < dlimit) {
if (isprint(*data))
(void) printf("%c ", *data);
@@ -166,7 +182,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
static void
zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
{
- (void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+ (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
(u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length);
}
@@ -178,38 +194,38 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
time_t atime = (time_t)lr->lr_atime[0];
time_t mtime = (time_t)lr->lr_mtime[0];
- (void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+ (void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
if (lr->lr_mask & AT_MODE) {
- (void) printf("\t\t\tAT_MODE %llo\n",
+ (void) printf("%sAT_MODE %llo\n", prefix,
(longlong_t)lr->lr_mode);
}
if (lr->lr_mask & AT_UID) {
- (void) printf("\t\t\tAT_UID %llu\n",
+ (void) printf("%sAT_UID %llu\n", prefix,
(u_longlong_t)lr->lr_uid);
}
if (lr->lr_mask & AT_GID) {
- (void) printf("\t\t\tAT_GID %llu\n",
+ (void) printf("%sAT_GID %llu\n", prefix,
(u_longlong_t)lr->lr_gid);
}
if (lr->lr_mask & AT_SIZE) {
- (void) printf("\t\t\tAT_SIZE %llu\n",
+ (void) printf("%sAT_SIZE %llu\n", prefix,
(u_longlong_t)lr->lr_size);
}
if (lr->lr_mask & AT_ATIME) {
- (void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+ (void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
(u_longlong_t)lr->lr_atime[0],
(u_longlong_t)lr->lr_atime[1],
ctime(&atime));
}
if (lr->lr_mask & AT_MTIME) {
- (void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+ (void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
(u_longlong_t)lr->lr_mtime[0],
(u_longlong_t)lr->lr_mtime[1],
ctime(&mtime));
@@ -220,7 +236,7 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
static void
zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
{
- (void) printf("\t\t\tfoid %llu, aclcnt %llu\n",
+ (void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
}
@@ -256,7 +272,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
};
/* ARGSUSED */
-static void
+static int
print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
{
int txtype;
@@ -280,23 +296,24 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
zil_rec_info[txtype].zri_count++;
zil_rec_info[0].zri_count++;
+
+ return (0);
}
/* ARGSUSED */
-static void
+static int
print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- char blkbuf[BP_SPRINTF_LEN];
+ char blkbuf[BP_SPRINTF_LEN + 10];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
char *claim;
if (verbose <= 3)
- return;
+ return (0);
if (verbose >= 5) {
(void) strcpy(blkbuf, ", ");
- sprintf_blkptr(blkbuf + strlen(blkbuf),
- BP_SPRINTF_LEN - strlen(blkbuf), bp);
+ sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
} else {
blkbuf[0] = '\0';
}
@@ -310,6 +327,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
(void) printf("\tBlock seqno %llu, %s%s\n",
(u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+ return (0);
}
static void
@@ -342,17 +361,17 @@ dump_intent_log(zilog_t *zilog)
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int i;
- if (zh->zh_log.blk_birth == 0 || verbose < 2)
+ if (zh->zh_log.blk_birth == 0 || verbose < 1)
return;
- (void) printf("\n ZIL header: claim_txg %llu, claim_seq %llu",
- (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+ (void) printf("\n ZIL header: claim_txg %llu, "
+ "claim_blk_seq %llu, claim_lr_seq %llu",
+ (u_longlong_t)zh->zh_claim_txg,
+ (u_longlong_t)zh->zh_claim_blk_seq,
+ (u_longlong_t)zh->zh_claim_lr_seq);
(void) printf(" replay_seq %llu, flags 0x%llx\n",
(u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
- if (verbose >= 4)
- print_log_bp(&zh->zh_log, "\n\tfirst block: ");
-
for (i = 0; i < TX_MAX_TYPE; i++)
zil_rec_info[i].zri_count = 0;
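
print_log_record() and print_log_block() now return int so the log walker
that invokes them can treat a nonzero return as "stop the walk". Under
that assumption (the walker's exact signature is not part of this hunk),
a conforming record callback looks like:

	/* ARGSUSED */
	static int
	count_log_record(zilog_t *zilog, lr_t *lr, void *arg,
	    uint64_t claim_txg)
	{
		uint64_t *nrecords = arg;

		(*nrecords)++;
		return (0);	/* nonzero would abort the traversal */
	}
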
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8
index 0d97026a4a43..0d40a9083c7e 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8
@@ -6,7 +6,7 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with
.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH zfs 1M "5 May 2009" "SunOS 5.11" "System Administration Commands"
+.TH zfs 1M "24 Sep 2009" "SunOS 5.11" "System Administration Commands"
.SH NAME
zfs \- configures ZFS file systems
.SH SYNOPSIS
@@ -27,7 +27,12 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBdestroy\fR [\fB-rRd\fR] \fIsnapshot\fR
.fi
.LP
@@ -75,7 +80,7 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|snapshot ...
+\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...
.fi
.LP
@@ -174,7 +179,7 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBallow\fR \fB-s\fR @setname \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR
+\fBzfs\fR \fBallow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR
.fi
.LP
@@ -195,7 +200,29 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @setname [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR
+\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBhold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBholds\fR [\fB-r\fR] \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrelease\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBjail\fR \fIjailid\fR \fIfilesystem\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunjail\fR \fIjailid\fR \fIfilesystem\fR
.fi
.SH DESCRIPTION
@@ -212,7 +239,7 @@ pool/{filesystem,volume,snapshot}
.sp
.LP
-\&...where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
+where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
.sp
.LP
A dataset can be one of the following:
@@ -224,7 +251,7 @@ A dataset can be one of the following:
.ad
.sp .6
.RS 4n
-A \fBZFS\fR dataset of type "filesystem" that can be mounted within the standard system namespace and behaves like other file systems. While \fBZFS\fR file systems are designed to be \fBPOSIX\fR compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space.
+A \fBZFS\fR dataset of type \fBfilesystem\fR can be mounted within the standard system namespace and behaves like other file systems. While \fBZFS\fR file systems are designed to be \fBPOSIX\fR compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space.
.RE
.sp
@@ -268,17 +295,17 @@ A snapshot is a read-only copy of a file system or volume. Snapshots can be crea
Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently.
.sp
.LP
-File system snapshots can be accessed under the ".zfs/snapshot" directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the ".zfs" directory can be controlled by the "snapdir" property.
+File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the \fB\&.zfs\fR directory can be controlled by the \fBsnapdir\fR property.
.SS "Clones"
.sp
.LP
A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space.
.sp
.LP
-Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The "origin" property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
+Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The \fBorigin\fR property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
.sp
.LP
-The clone parent-child dependency relationship can be reversed by using the "\fBpromote\fR" subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from.
+The clone parent-child dependency relationship can be reversed by using the \fBpromote\fR subcommand. This causes the \fBorigin\fR file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from.
.SS "Mount Points"
.sp
.LP
@@ -304,10 +331,10 @@ A \fBZFS\fR file system can be added to a non-global zone by using the \fBzonecf
The physical properties of an added file system are controlled by the global administrator. However, the zone administrator can create, modify, or destroy files within the added file system, depending on how the file system is mounted.
.sp
.LP
-A dataset can also be delegated to a non-global zone by using \fBzonecfg\fR \fBadd dataset\fR subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or any of its children. However, the \fBquota\fR property is controlled by the global administrator.
+A dataset can also be delegated to a non-global zone by using the \fBzonecfg\fR \fBadd dataset\fR subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or any of its children. However, the \fBquota\fR property is controlled by the global administrator.
.sp
.LP
-A \fBZFS\fR volume can be added as a device to a non-global zone by using \fBzonecfg\fR \fBadd device\fR subcommand. However, its physical properties can be modified only by the global administrator.
+A \fBZFS\fR volume can be added as a device to a non-global zone by using the \fBzonecfg\fR \fBadd device\fR subcommand. However, its physical properties can be modified only by the global administrator.
.sp
.LP
For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M).
@@ -320,7 +347,7 @@ The global administrator can forcibly clear the \fBzoned\fR property, though thi
.SS "Native Properties"
.sp
.LP
-Properties are divided into two types, native and user-defined (or "user"). Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section, below.
+Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section, below.
.sp
.LP
Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots).
@@ -380,6 +407,17 @@ The time this dataset was created.
.ne 2
.mk
.na
+\fB\fBdefer_destroy\fR\fR
+.ad
+.sp .6
+.RS 4n
+This property is \fBon\fR if the snapshot has been marked for deferred destroy by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fBmounted\fR\fR
.ad
.sp .6
@@ -489,7 +527,7 @@ The amount of space used by a \fBrefreservation\fR set on this dataset, which wo
.ad
.sp .6
.RS 4n
-The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR properties because space can be shared by multiple snapshots
+The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR properties because space can be shared by multiple snapshots.
.RE
.sp
@@ -500,34 +538,34 @@ The amount of space consumed by snapshots of this dataset. In particular, it is
.ad
.sp .6
.RS 4n
-The amount of space referenced in this dataset by the specified user. Space is charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR. The amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the \fBzfs userspace\fR subcommand for more information.
+The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR. The amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the \fBzfs userspace\fR subcommand for more information.
.sp
Unprivileged users can access only their own space usage. The root user, or a user who has been granted the \fBuserused\fR privilege with \fBzfs allow\fR, can access everyone's usage.
.sp
-This property cannot be set on volumes, or on pools before version 15. The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
+The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
.RS +4
.TP
.ie t \(bu
.el o
-\fIposix name\fR (for example, \fBjoe\fR)
+\fIPOSIX name\fR (for example, \fBjoe\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIposix numeric id\fR (for example, \fB789\fR)
+\fIPOSIX numeric ID\fR (for example, \fB789\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIsid name\fR (for example, \fBjoe.smith@mydomain\fR)
+\fISID name\fR (for example, \fBjoe.smith@mydomain\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIsid numeric id\fR (for example, \fBS-1-123-456-789\fR)
+\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR)
.RE
.RE
@@ -535,13 +573,24 @@ This property cannot be set on volumes, or on pools before version 15. The \fBus
.ne 2
.mk
.na
+\fB\fBuserrefs\fR\fR
+.ad
+.sp .6
+.RS 4n
+This property is set to the number of user holds on this snapshot. User holds are set by using the \fBzfs hold\fR command.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fBgroupused@\fR\fIgroup\fR\fR
.ad
.sp .6
.RS 4n
-The amount of space referenced in this dataset by the specified group. Space is charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the \fBuserused@\fR\fIuser\fR property for more information.
+The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the \fBuserused@\fR\fIuser\fR property for more information.
.sp
-Unprivileged users can only access the \fBgroupused@\fR... property for groups that they are a member of. The root user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs allow\fR, can access all groups' usage.
+Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs allow\fR, can access all groups' usage.
.RE
.sp
@@ -618,7 +667,9 @@ This property is not inherited.
.ad
.sp .6
.RS 4n
-Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher2\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice.
+Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice.
+.sp
+Changing this property affects only newly-written data.
.RE
.sp
@@ -629,22 +680,22 @@ Controls the checksum used to verify data integrity. The default value is \fBon\
.ad
.sp .6
.RS 4n
-Controls the compression algorithm used for this dataset. The \fBlzjb\fR compression algorithm is optimized for performance while providing decent data compression. Setting compression to "on" uses the "lzjb" compression algorithm. The "gzip" compression algorithm uses the same compression as the \fBgzip\fR(1) command. You can specify the "gzip" level by using the value "gzip-\fIN\fR" where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, "gzip" is equivalent to "gzip-6" (which is also the default for \fBgzip\fR(1)).
+Controls the compression algorithm used for this dataset. The \fBlzjb\fR compression algorithm is optimized for performance while providing decent data compression. Setting compression to \fBon\fR uses the \fBlzjb\fR compression algorithm. The \fBgzip\fR compression algorithm uses the same compression as the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR (which is also the default for \fBgzip\fR(1)).
.sp
-This property can also be referred to by its shortened column name "compress".
+This property can also be referred to by its shortened column name \fBcompress\fR. Changing this property affects only newly-written data.
.RE
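
As an illustration of the gzip-N form documented above, a level-9 gzip
could be enabled on a (hypothetical) dataset; only data written after the
change is compressed at the new setting:

	# zfs set compression=gzip-9 tank/logs
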
.sp
.ne 2
.mk
.na
-\fBcopies=\fB1\fR | \fB2\fR | \fB3\fR\fR
+\fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR
.ad
.sp .6
.RS 4n
-Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or \fBraid-z\fR. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the \fBused\fR property and counting against quotas and reservations.
+Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the \fBused\fR property and counting against quotas and reservations.
.sp
-Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the \fB-o\fR \fBcopies=\fR option.
+Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the \fB-o\fR \fBcopies=\fR\fIN\fR option.
.RE
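
Because copies affects only newly-written data, it is best supplied when
the file system is created, for example (dataset name hypothetical):

	# zfs create -o copies=2 tank/home/bob
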
.sp
@@ -725,36 +776,36 @@ Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an implic
.ad
.sp .6
.RS 4n
-Limits the amount of space referenced by the specified user, which is specified by the \fBuserspace@\fR\fIuser\fR property.
+Limits the amount of space consumed by the specified user. User space consumption is identified by the \fBuserspace@\fR\fIuser\fR property.
.sp
-Enforcement of user quotas may be delayed by several seconds. In other words, users may go a bit over their quota before the system notices that they are over quota and begins to refuse additional writes with \fBEDQUOT\fR. See the \fBzfs userspace\fR subcommand for more information.
+Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the \fBEDQUOT\fR error message. See the \fBzfs userspace\fR subcommand for more information.
.sp
-Unprivileged users can get only their own quota. The root user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs allow\fR, can get and set everyone's quota.
+Unprivileged users can access only their own quota. The root user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs allow\fR, can get and set everyone's quota.
.sp
-This property cannot be set on volumes, on filesystems before version 4, or on pools before version 15. The \fBuserquota@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
+This property is not available on volumes, on file systems before version 4, or on pools before version 15. The \fBuserquota@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
.RS +4
.TP
.ie t \(bu
.el o
-\fIposix name\fR (for example, \fBjoe\fR)
+\fIPOSIX name\fR (for example, \fBjoe\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIposix numeric id\fR (for example, \fB789\fR)
+\fIPOSIX numeric ID\fR (for example, \fB789\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIsid name\fR (for example, \fBjoe.smith@mydomain\fR)
+\fISID name\fR (for example, \fBjoe.smith@mydomain\fR)
.RE
.RS +4
.TP
.ie t \(bu
.el o
-\fIsid numeric id\fR (for example, \fBS-1-123-456-789\fR)
+\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR)
.RE
.RE
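
Combining the forms above with zfs set and zfs get, a user quota might be
managed as follows (user and dataset names hypothetical):

	# zfs set userquota@joe=50G tank/home
	# zfs get userquota@joe tank/home
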
@@ -766,9 +817,9 @@ This property cannot be set on volumes, on filesystems before version 4, or on p
.ad
.sp .6
.RS 4n
-Limits the amount of space referenced by the specified group. See the \fBuserquota@\fR\fIuser\fR property for more information.
+Limits the amount of space consumed by the specified group. Group space consumption is identified by the \fBuserquota@\fR\fIuser\fR property.
.sp
-Unprivileged users can only get the quota of groups they are a member of. The root user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs allow\fR, can get and set all groups' quotas.
+Unprivileged users can access only the quotas of groups of which they are a member. The root user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs allow\fR, can get and set all groups' quotas.
.RE
.sp
@@ -904,7 +955,18 @@ When the \fBsharesmb\fR property is changed for a dataset, the dataset and any c
.RS 4n
Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a \fBsharenfs\fR property of \fBoff\fR is managed through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and \fBdfstab\fR(4). Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the \fBshare\fR(1M) command is invoked with no options. Otherwise, the \fBshare\fR(1M) command is invoked with options equivalent to the contents of this property.
.sp
-When the \fBsharenfs\fR property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously "off", or if they were shared before the property was changed. If the new property is \fBoff\fR, the file systems are unshared.
+When the \fBsharenfs\fR property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously \fBoff\fR, or if they were shared before the property was changed. If the new property is \fBoff\fR, the file systems are unshared.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBlogbias\fR = \fBlatency\fR | \fBthroughput\fR\fR
+.ad
+.sp .6
+.RS 4n
+Provide a hint to ZFS about handling of synchronous requests in this dataset. If \fBlogbias\fR is set to \fBlatency\fR (the default), ZFS will use pool log devices (if configured) to handle the requests at low latency. If \fBlogbias\fR is set to \fBthroughput\fR, ZFS will not use configured pool log devices. ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources.
.RE
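
As a usage sketch, a dataset dominated by large synchronous writes could
be steered away from the dedicated log devices (names hypothetical):

	# zfs set logbias=throughput tank/db
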
.sp
@@ -959,7 +1021,7 @@ Controls whether regular files should be scanned for viruses when a file is open
.ne 2
.mk
.na
-\fBxattr=\fBon\fR | \fBoff\fR\fR
+\fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR
.ad
.sp .6
.RS 4n
@@ -997,7 +1059,7 @@ The \fBmixed\fR value for the \fBcasesensitivity\fR property indicates that the
.ne 2
.mk
.na
-\fB\fBnormalization\fR=\fBnone\fR | \fBformD\fR | \fBformKCf\fR\fR
+\fB\fBnormalization\fR = \fBnone\fR | \fBformC\fR | \fBformD\fR | \fBformKC\fR | \fBformKD\fR\fR
.ad
.sp .6
.RS 4n
@@ -1008,6 +1070,17 @@ Indicates whether the file system should perform a \fBunicode\fR normalization o
.ne 2
.mk
.na
+\fB\fBjailed\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the dataset is managed from within a jail. The default value is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fButf8only\fR=\fBon\fR | \fBoff\fR\fR
.ad
.sp .6
@@ -1081,7 +1154,7 @@ Displays a help message.
.ad
.sp .6
.RS 4n
-Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the "mountpoint" property inherited from the parent.
+Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the \fBmountpoint\fR property inherited from the parent.
.sp
.ne 2
.mk
@@ -1101,7 +1174,7 @@ Creates all the non-existing parent datasets. Datasets created in this manner ar
.ad
.sp .6
.RS 4n
-Sets the specified property as if the command \fBzfs set \fIproperty\fR=\fIvalue\fR\fR was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
+Sets the specified property as if the command \fBzfs set\fR \fIproperty\fR=\fIvalue\fR was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
.RE
.RE
@@ -1114,7 +1187,7 @@ Sets the specified property as if the command \fBzfs set \fIproperty\fR=\fIvalue
.ad
.sp .6
.RS 4n
-Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fIpath\fR\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created.
+Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fR\fIpath\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created.
.sp
\fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR.
.sp
@@ -1147,7 +1220,7 @@ Creates a sparse volume with no reservation. See \fBvolsize\fR in the Native Pro
.ad
.sp .6
.RS 4n
-Sets the specified property as if the \fBzfs set \fIproperty\fR=\fIvalue\fR\fR command was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
+Sets the specified property as if the \fBzfs set\fR \fIproperty\fR=\fIvalue\fR command was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
.RE
.sp
@@ -1158,7 +1231,7 @@ Sets the specified property as if the \fBzfs set \fIproperty\fR=\fIvalue\fR\fR c
.ad
.sp .6
.RS 4n
-Equivalent to \fB\fR\fB-o\fR \fBvolblocksize=\fIblocksize\fR\fR. If this option is specified in conjunction with \fB-o\fR \fBvolblocksize\fR, the resulting behavior is undefined.
+Equivalent to \fB-o\fR \fBvolblocksize\fR=\fIblocksize\fR. If this option is specified in conjunction with \fB-o\fR \fBvolblocksize\fR, the resulting behavior is undefined.
.RE
.RE
@@ -1167,11 +1240,11 @@ Equivalent to \fB\fR\fB-o\fR \fBvolblocksize=\fIblocksize\fR\fR. If this option
.ne 2
.mk
.na
-\fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+\fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR\fR
.ad
.sp .6
.RS 4n
-Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children, snapshots, clones).
+Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children or clones).
.sp
.ne 2
.mk
@@ -1180,7 +1253,7 @@ Destroys the given dataset. By default, the command unshares any file systems th
.ad
.sp .6
.RS 4n
-Recursively destroy all children. If a snapshot is specified, destroy all snapshots with this name in descendent file systems.
+Recursively destroy all children.
.RE
.sp
@@ -1191,7 +1264,7 @@ Recursively destroy all children. If a snapshot is specified, destroy all snapsh
.ad
.sp .6
.RS 4n
-Recursively destroy all dependents, including cloned file systems outside the target hierarchy. If a snapshot is specified, destroy all snapshots with this name in descendent file systems.
+Recursively destroy all dependents, including cloned file systems outside the target hierarchy.
.RE
.sp
@@ -1212,6 +1285,52 @@ Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR o
.ne 2
.mk
.na
+\fB\fBzfs destroy\fR [\fB-rRd\fR] \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+The given snapshot is destroyed immediately if and only if the \fBzfs destroy\fR command without the \fB-d\fR option would have destroyed it. Such immediate destruction would occur, for example, if the snapshot had no clones and the user-initiated reference count were zero.
+.sp
+If the snapshot does not qualify for immediate destruction, it is marked for deferred deletion. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR\fR
+.ad
+.sp .6
+.RS 4n
+Defer snapshot deletion.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Destroy (or mark for deferred deletion) all snapshots with this name in descendent file systems.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively destroy all dependents.
+.RE
+
+.RE
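
Tying this to the defer_destroy property described earlier, a snapshot
that still has clones or holds can be queued for deletion and then
inspected (names hypothetical):

	# zfs destroy -d tank/home@tuesday
	# zfs get defer_destroy tank/home@tuesday
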
+
+.sp
+.ne 2
+.mk
+.na
\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR
.ad
.sp .6
@@ -1251,6 +1370,8 @@ Sets the specified property; see \fBzfs create\fR for details.
.RS 4n
Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than the most recent one. In order to do so, all intermediate snapshots must be destroyed by specifying the \fB-r\fR option.
.sp
+The \fB-rR\fR options do not recursively destroy the child snapshots of a recursive snapshot. Only the top-level recursive snapshot is destroyed by either of these options. To completely roll back a recursive snapshot, you must roll back the individual child snapshots.
+.sp
.ne 2
.mk
.na
@@ -1380,15 +1501,7 @@ Recursively rename the snapshots of all descendent datasets. Snapshots are the o
.ad
.sp .6
.RS 4n
-Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR) . The following fields are displayed:
-.sp
-.in +2
-.nf
-name,used,available,referenced,mountpoint
-.fi
-.in -2
-.sp
-
+Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR). The following fields are displayed: \fBname,used,available,referenced,mountpoint\fR.
.sp
.ne 2
.mk
@@ -1397,7 +1510,7 @@ name,used,available,referenced,mountpoint
.ad
.sp .6
.RS 4n
-Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary whitespace.
+Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space.
.RE
.sp
@@ -1435,34 +1548,25 @@ A comma-separated list of properties to display. The property must be:
.TP
.ie t \(bu
.el o
-one of the properties described in the "Native Properties" section
+One of the properties described in the "Native Properties" section
.RE
.RS +4
.TP
.ie t \(bu
.el o
-a user property
+A user property
.RE
.RS +4
.TP
.ie t \(bu
.el o
-the value \fBname\fR to display the dataset name
+The value \fBname\fR to display the dataset name
.RE
.RS +4
.TP
.ie t \(bu
.el o
-the value \fBspace\fR to display space usage properties on file systems and volumes. This is a shortcut for:
-.sp
-.in +2
-.nf
--o name,avail,used,usedsnap,usedds,usedrefreserv,\e
-usedchild -t filesystem,volume
-.fi
-.in -2
-.sp
-
+The value \fBspace\fR to display space usage properties on file systems and volumes. This is a shortcut for specifying the \fB-o name,avail,used,usedsnap,usedds,usedrefreserv,usedchild\fR \fB-t filesystem,volume\fR syntax.
.RE
.RE
@@ -1474,7 +1578,7 @@ usedchild -t filesystem,volume
.ad
.sp .6
.RS 4n
-A property to use for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value \fBname\fR to sort by the dataset name. Multiple properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance.
+A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value \fBname\fR to sort by the dataset name. Multiple properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance.
.sp
The following is a list of sorting criteria:
.RS +4
@@ -1535,7 +1639,7 @@ A comma-separated list of types to display, where \fItype\fR is one of \fBfilesy
.ad
.sp .6
.RS 4n
-Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of \fBB\fR, \fBK\fR, \fBM\fR, \fBG\fR, \fBT\fR, \fBP\fR, \fBE\fR, \fBZ\fR (for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). Properties cannot be set on snapshots.
+Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of \fBB\fR, \fBK\fR, \fBM\fR, \fBG\fR, \fBT\fR, \fBP\fR, \fBE\fR, \fBZ\fR (for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). User properties can be set on snapshots. For more information, see the "User Properties" section.
.RE
.sp
@@ -1671,11 +1775,11 @@ Displays a list of file systems that are not the most recent version.
.ad
.sp .6
.RS 4n
-Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. \fBzfs send\fR streams generated from new snapshots of these file systems can not be accessed on systems running older versions of the software.
+Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. \fBzfs send\fR streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of the software.
.sp
-The file system version is independent of the pool version (see \fBzpool\fR(1M) for information on the \fBzpool upgrade\fR command).
+In general, the file system version is independent of the pool version. See \fBzpool\fR(1M) for information on the \fBzpool upgrade\fR command.
.sp
-The file system version does not have to be upgraded when the pool version is upgraded, and vice-versa.
+In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded.
.sp
.ne 2
.mk
@@ -1772,16 +1876,7 @@ Use exact (parseable) numeric output.
.ad
.sp .6
.RS 4n
-Display only the specified fields, from the following set:
-.sp
-.in +2
-.nf
-type,name,used,quota
-.fi
-.in -2
-.sp
-
-The default is to display all fields.
+Display only the specified fields from the following set: \fBtype,name,used,quota\fR. The default is to display all fields.
.RE
.sp
@@ -1792,15 +1887,7 @@ The default is to display all fields.
.ad
.sp .6
.RS 4n
-Sort output by this field. The \fIs\fR and \fIS\fR flags may be specified multiple times to sort first by one field, then by another. The default is:
-.sp
-.in +2
-.nf
--s type -s name
-.fi
-.in -2
-.sp
-
+Sort output by this field. The \fB-s\fR and \fB-S\fR flags may be specified multiple times to sort first by one field, then by another. The default is \fB-s type\fR \fB-s name\fR.
.RE
.sp
@@ -1822,25 +1909,11 @@ Sort by this field in reverse order. See \fB-s\fR.
.ad
.sp .6
.RS 4n
-Print only the specified types, from the following set:
-.sp
-.in +2
-.nf
-all,posixuser,smbuser,posixgroup,smbgroup
-.fi
-.in -2
+Print only the specified types from the following set: \fBall,posixuser,smbuser,posixgroup,smbgroup\fR.
.sp
-
-The default is:
+The default is \fB-t posixuser,smbuser\fR.
.sp
-.in +2
-.nf
--t posixuser,smbuser
-.fi
-.in -2
-.sp
-
-\&...but can be changed to include group types.
+The default can be changed to include group types.
.RE
.sp
@@ -1851,7 +1924,7 @@ The default is:
.ad
.sp .6
.RS 4n
-Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists. Normal POSIX interfaces (for example, \fBstat\fR(2), \fBls\fR \fB-l\fR) perform this translation, so the \fB-i\fR option allows the output from \fBzfs userspace\fR to be compared directly with those utilities. However, \fB-i\fR may lead to confusion if some files were created by an SMB user before a SMB-to-POSIX name mapping was established. In such a case, some files are owned by the SMB entity and some by the POSIX entity. However, he \fB-i\fR option will report that the POSIX entity has the total usage and quota for both.
+Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists. Normal POSIX interfaces (for example, \fBstat\fR(2), \fBls\fR \fB-l\fR) perform this translation, so the \fB-i\fR option allows the output from \fBzfs userspace\fR to be compared directly with those utilities. However, \fB-i\fR may lead to confusion if some files were created by an SMB user before a SMB-to-POSIX name mapping was established. In such a case, some files are owned by the SMB entity and some by the POSIX entity. However, the \fB-i\fR option will report that the POSIX entity has the total usage and quota for both.
.RE
.RE
@@ -1864,11 +1937,11 @@ Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists. N
.ad
.sp .6
.RS 4n
-Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to \fBzfs userspace\fR, except that the default types to display are:
+Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to \fBzfs userspace\fR, except that the default types to display are \fB-t posixgroup,smbgroup\fR.
.sp
.in +2
.nf
--t posixgroup,smbgroup
+-
.fi
.in -2
.sp
@@ -2119,7 +2192,7 @@ If the \fB-i\fR or \fB-I\fR flags are used in conjunction with the \fB-R\fR flag
Print verbose information about the stream package generated.
.RE
-The format of the stream is evolving. No backwards compatibility is guaranteed. You may not be able to receive your streams on future versions of \fBZFS\fR.
+The format of the stream is committed. You will be able to receive your streams on future versions of \fBZFS\fR.
.RE
.sp
@@ -2138,6 +2211,8 @@ Creates a snapshot whose contents are as specified in the stream provided on sta
.sp
If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For \fBzvols\fR, the destination device link is destroyed and recreated, which means the \fBzvol\fR cannot be accessed during the \fBreceive\fR operation.
.sp
+When a snapshot replication package stream that is generated by using the \fBzfs send\fR \fB-R\fR command is received, any snapshots that do not exist on the sending location are destroyed by using the \fBzfs destroy\fR \fB-d\fR command.
+.sp
The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the \fB-d\fR option.
.sp
If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR. If the \fB-d\fR option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified \fIfilesystem\fR. If the \fB-d\fR option is specified, any required file systems within the specified one are created.
@@ -2241,7 +2316,7 @@ Specifies to whom the permissions are delegated. Multiple entities can be specif
.ad
.sp .6
.RS 4n
-Specifies that the permissions be delegated to "everyone." Multiple permissions may be specified as a comma-separated list. Permission names are the same as \fBZFS\fR subcommand and property names. See the property list below. Property set names, which begin with an "at sign" ("@") , may be specified. See the \fB-s\fR form below for details.
+Specifies that the permissions be delegated to "everyone." Multiple permissions may be specified as a comma-separated list. Permission names are the same as \fBZFS\fR subcommand and property names. See the property list below. Property set names, which begin with an at sign (\fB@\fR) , may be specified. See the \fB-s\fR form below for details.
.RE
.sp
@@ -2263,67 +2338,63 @@ Permissions are generally the ability to use a \fBZFS\fR subcommand or change a
.sp
.in +2
.nf
-NAME TYPE NOTES
-allow subcommand Must also have the permission
- that is being allowed.
-clone subcommand Must also have the 'create' ability
- and the 'mount' ability in the origin
- file system.
-create subcommand Must also have the 'mount' ability.
-destroy subcommand Must also have the 'mount' ability.
-mount subcommand Allows mount, unmount, and
- create/remove zvol device links.
-promote subcommand Must also have the 'mount' ability and
- 'promote' ability in the origin file system.
-receive subcommand Must also have the 'mount' ability and
- the 'create' ability.
-rename subcommand Must also have the 'mount' ability and
- the 'create' ability in the new parent.
-rollback subcommand Must also have the 'mount' ability.
-snapshot subcommand Must also have the 'mount' ability.
-share subcommand Allows share and unshare.
-send subcommand
-
-
-aclinherit property
-aclmode property
-atime property
-canmount property
-casesensitivity property
-checksum property
-compression property
-copies property
-devices property
-exec property
-groupquota other Allows accessing any groupquota@... property.
-groupused other Allows reading any groupused@... property.
-mountpoint property
-nbmand property
-normalization property
-primarycache property
-quota property
-readonly property
-recordsize property
-refquota property
-refreservation property
-reservation property
-secondarycache property
-setuid property
-shareiscsi property
-sharenfs property
-sharesmb property
-snapdir property
-utf8only property
-userprop other Allows changing any user property.
-userquota other Allows accessing any userquota@... property.
-userused other Allows reading any userused@... property.
-version property
-volblocksize property
-volsize property
-vscan property
-xattr property
-zoned property
-userprop other Allows changing any user property.
+NAME TYPE NOTES
+allow subcommand Must also have the permission that is being
+ allowed
+clone subcommand Must also have the 'create' ability and 'mount'
+ ability in the origin file system
+create subcommand Must also have the 'mount' ability
+destroy subcommand Must also have the 'mount' ability
+mount subcommand Allows mount/umount of ZFS datasets
+promote subcommand Must also have the 'mount'
+ and 'promote' ability in the origin file system
+receive subcommand Must also have the 'mount' and 'create' ability
+rename subcommand Must also have the 'mount' and 'create'
+ ability in the new parent
+rollback subcommand Must also have the 'mount' ability
+send subcommand
+share subcommand Allows sharing file systems over NFS or SMB
+ protocols
+snapshot subcommand Must also have the 'mount' ability
+groupquota other Allows accessing any groupquota@... property
+groupused other Allows reading any groupused@... property
+userprop other Allows changing any user property
+userquota other Allows accessing any userquota@... property
+userused other Allows reading any userused@... property
+
+aclinherit property
+aclmode property
+atime property
+canmount property
+casesensitivity property
+checksum property
+compression property
+copies property
+devices property
+exec property
+mountpoint property
+nbmand property
+normalization property
+primarycache property
+quota property
+readonly property
+recordsize property
+refquota property
+refreservation property
+reservation property
+secondarycache property
+setuid property
+shareiscsi property
+sharenfs property
+sharesmb property
+snapdir property
+utf8only property
+version property
+volblocksize property
+volsize property
+vscan property
+xattr property
+zoned property
.fi
.in -2
.sp
@@ -2343,7 +2414,7 @@ Sets "create time" permissions. These permissions are granted (locally) to the c
.ne 2
.mk
.na
-\fB\fBzfs allow\fR \fB-s\fR @setname \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR
+\fB\fBzfs allow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR
.ad
.sp .6
.RS 4n
@@ -2388,7 +2459,7 @@ Recursively remove the permissions from this file system and all descendents.
.ne 2
.mk
.na
-\fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @setname [\fIperm\fR|@\fIsetname\fR[,...]]\fR
+\fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,...]]\fR
.ad
.br
.na
@@ -2399,12 +2470,101 @@ Recursively remove the permissions from this file system and all descendents.
Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely.
.RE
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs hold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Adds a single reference, named with the \fItag\fR argument, to the specified snapshot or snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space.
+.sp
+If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems.
+.RE
+
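+.sp
+For example, the following command (the tag and snapshot names here are illustrative) places a recursive hold named \fBkeep\fR:
+.sp
+.in +2
+.nf
+# \fBzfs hold -r keep tank/home@snap1\fR
+.fi
+.in -2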
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs holds\fR [\fB-r\fR] \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Lists all existing user references for the given snapshot or snapshots.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Lists the holds that are set on the named descendent snapshots, in addition to listing the holds on the named snapshot.
+.RE
+
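+.sp
+For example, continuing the illustrative hold above (the output shown is a sketch), each line names the snapshot, the tag, and the time the hold was added:
+.sp
+.in +2
+.nf
+# \fBzfs holds tank/home@snap1\fR
+NAME             TAG   TIMESTAMP
+tank/home@snap1  keep  Tue Jul 21 15:53 2009
+.fi
+.in -2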
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs release\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Removes a single reference, named with the \fItag\fR argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot.
+.sp
+If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively releases a hold with the given tag on the snapshots of all descendent file systems.
+.RE
+
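+.sp
+Continuing the illustrative example, the hold is removed, allowing the snapshot to be destroyed:
+.sp
+.in +2
+.nf
+# \fBzfs release -r keep tank/home@snap1\fR
+.fi
+.in -2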
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs jail\fR \fIjailid\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Attaches the given file system to the given jail. The file system tree can then be managed from within the jail if the \fBjailed\fR property has been set on it.
+To use this functionality, the sysctl \fBsecurity.jail.enforce_statfs\fR should be set to 0 and the sysctl \fBsecurity.jail.mount_allowed\fR should be set to 1.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unjail\fR \fIjailid\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Detaches the given file system from the given jail.
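+.sp
+As a sketch (the jail ID and dataset name are hypothetical), a file system is handed to a jail and later reclaimed with:
+.sp
+.in +2
+.nf
+# \fBzfs set jailed=on tank/jails/j1\fR
+# \fBzfs jail 1 tank/jails/j1\fR
+# \fBzfs unjail 1 tank/jails/j1\fR
+.fi
+.in -2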
+.RE
+
.SH EXAMPLES
.LP
\fBExample 1 \fRCreating a ZFS File System Hierarchy
.sp
.LP
-The following commands create a file system named \fBpool/home\fR and a file system named \fBpool/home/bob\fR. The mount point \fB/export/home\fR is set for the parent file system, and automatically inherited by the child file system.
+The following commands create a file system named \fBpool/home\fR and a file system named \fBpool/home/bob\fR. The mount point \fB/export/home\fR is set for the parent file system, and is automatically inherited by the child file system.
.sp
.in +2
@@ -2431,7 +2591,7 @@ The following command creates a snapshot named \fByesterday\fR. This snapshot is
.sp
.LP
-\fBExample 3 \fRTaking and Destroying Multiple Snapshots
+\fBExample 3 \fRCreating and Destroying Multiple Snapshots
.sp
.LP
The following command creates snapshots named \fByesterday\fR of \fBpool/home\fR and all of its descendent file systems. Each snapshot is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of its file system. The second command destroys the newly created snapshots.
@@ -2446,10 +2606,10 @@ The following command creates snapshots named \fByesterday\fR of \fBpool/home\fR
.sp
.LP
-\fBExample 4 \fRTurning Off Compression
+\fBExample 4 \fRDisabling and Enabling File System Compression
.sp
.LP
-The following commands turn compression off for all file systems under \fBpool/home\fR, but explicitly turns it on for \fBpool/home/anne\fR.
+The following command disables the \fBcompression\fR property for all file systems under \fBpool/home\fR. The next command explicitly enables \fBcompression\fR for \fBpool/home/anne\fR.
.sp
.in +2
@@ -2464,14 +2624,12 @@ The following commands turn compression off for all file systems under \fBpool/h
\fBExample 5 \fRListing ZFS Datasets
.sp
.LP
-The following command lists all active file systems and volumes in the system. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR) . See \fBzpool\fR(1M) for more information on pool properties.
+The following command lists all active file systems and volumes in the system. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR. The default is \fBoff\fR. See \fBzpool\fR(1M) for more information on pool properties.
.sp
.in +2
.nf
# \fBzfs list\fR
-
-
NAME       USED  AVAIL  REFER  MOUNTPOINT
pool       450K   457G    18K  /pool
pool/home  315K   457G    21K  /export/home
@@ -2505,25 +2663,21 @@ The following command lists all properties for \fBpool/home/bob\fR.
.in +2
.nf
# \fBzfs get all pool/home/bob\fR
-
-
NAME           PROPERTY              VALUE                  SOURCE
pool/home/bob  type                  filesystem             -
-pool/home/bob  creation              Thu Jul 12 14:44 2007  -
-pool/home/bob  used                  276K                   -
-pool/home/bob  available             50.0G                  -
-pool/home/bob  referenced            276K                   -
+pool/home/bob  creation              Tue Jul 21 15:53 2009  -
+pool/home/bob  used                  21K                    -
+pool/home/bob  available             20.0G                  -
+pool/home/bob  referenced            21K                    -
pool/home/bob  compressratio         1.00x                  -
pool/home/bob  mounted               yes                    -
-pool/home/bob  quota                 50G                    local
+pool/home/bob  quota                 20G                    local
pool/home/bob  reservation           none                   default
pool/home/bob  recordsize            128K                   default
-pool/home/bob  mountpoint            /export/home/bob       inherited
-                                                            from
-                                                            pool/home
+pool/home/bob  mountpoint            /pool/home/bob         default
pool/home/bob  sharenfs              off                    default
pool/home/bob  checksum              on                     default
-pool/home/bob  compression           off                    default
+pool/home/bob  compression           on                     local
pool/home/bob  atime                 on                     default
pool/home/bob  devices               on                     default
pool/home/bob  exec                  on                     default
@@ -2537,22 +2691,21 @@ pool/home/bob canmount on default
pool/home/bob  shareiscsi            off                    default
pool/home/bob  xattr                 on                     default
pool/home/bob  copies                1                      default
-pool/home/bob  version               1                      -
+pool/home/bob  version               4                      -
pool/home/bob  utf8only              off                    -
pool/home/bob  normalization         none                   -
pool/home/bob  casesensitivity       sensitive              -
pool/home/bob  vscan                 off                    default
pool/home/bob  nbmand                off                    default
pool/home/bob  sharesmb              off                    default
-pool/home/bob  refquota              10M                    local
+pool/home/bob  refquota              none                   default
pool/home/bob  refreservation        none                   default
pool/home/bob  primarycache          all                    default
-pool/home/bob  secondarycache        a                      default
+pool/home/bob  secondarycache        all                    default
pool/home/bob  usedbysnapshots       0                      -
-pool/home/bob  usedbydataset         18K                    -
+pool/home/bob  usedbydataset         21K                    -
pool/home/bob  usedbychildren        0                      -
pool/home/bob  usedbyrefreservation  0                      -
-
.fi
.in -2
.sp
@@ -2578,10 +2731,9 @@ The following command lists all properties with local settings for \fBpool/home/
.in +2
.nf
# \fBzfs get -r -s local -o name,property,value all pool/home/bob\fR
-
- NAME       PROPERTY     VALUE
- pool       compression  on
- pool/home  checksum     off
+NAME           PROPERTY     VALUE
+pool/home/bob  quota        20G
+pool/home/bob  compression  on
.fi
.in -2
.sp
@@ -2669,7 +2821,7 @@ The following commands send a full stream and then an incremental stream to a re
.sp
.LP
-\fBExample 13 \fRUsing the \fBreceive\fR \fB-d\fR Option
+\fBExample 13 \fRUsing the \fBzfs receive\fR \fB-d\fR Option
.sp
.LP
The following command sends a full stream of \fBpoolA/fsA/fsB@snap\fR to a remote machine, receiving it into \fBpoolB/received/fsA/fsB@snap\fR. The \fBfsA/fsB@snap\fR portion of the received snapshot's name is determined from the name of the sent snapshot. \fBpoolB\fR must contain the file system \fBpoolB/received\fR. If \fBpoolB/received/fsA\fR does not exist, it is created as an empty file system.
@@ -2752,7 +2904,6 @@ The following commands show how to set \fBsharenfs\fR property options to enable
.in +2
.nf
# \fB# zfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home\fR
-
.fi
.in -2
.sp
@@ -2770,13 +2921,12 @@ The following example shows how to set permissions so that user \fBcindys\fR can
.sp
.in +2
.nf
-# \fB# zfs allow cindys create,destroy,mount,snapshot tank/cindys\fR
+# \fBzfs allow cindys create,destroy,mount,snapshot tank/cindys\fR
# \fBzfs allow tank/cindys\fR
-------------------------------------------------------------
Local+Descendent permissions on (tank/cindys)
        user cindys create,destroy,mount,snapshot
-------------------------------------------------------------
-
.fi
.in -2
.sp
@@ -2853,8 +3003,8 @@ The following example shows to grant the ability to set quotas and reservations
Local+Descendent permissions on (users/home)
        user cindys quota,reservation
-------------------------------------------------------------
-cindys% zfs set quota=10G users/home/marks
-cindys% zfs get quota users/home/marks
+cindys% \fBzfs set quota=10G users/home/marks\fR
+cindys% \fBzfs get quota users/home/marks\fR
NAME              PROPERTY  VALUE  SOURCE
users/home/marks  quota     10G    local
.fi
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
index ca5c2b232786..e2ab90eaf14f 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <libintl.h>
@@ -107,7 +106,8 @@ zfs_callback(zfs_handle_t *zhp, void *data)
zfs_prune_proplist(zhp,
cb->cb_props_table);
- if (zfs_expand_proplist(zhp, cb->cb_proplist)
+ if (zfs_expand_proplist(zhp, cb->cb_proplist,
+ (cb->cb_flags & ZFS_ITER_RECVD_PROPS))
!= 0) {
free(node);
return (-1);
@@ -350,11 +350,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
- if (avl_pool == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory\n"));
- exit(1);
- }
+ if (avl_pool == NULL)
+ nomem();
cb.cb_sortcol = sortcol;
cb.cb_flags = flags;
@@ -362,7 +359,7 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
cb.cb_types = types;
cb.cb_depth_limit = limit;
/*
- * If cb_proplist is provided then in the zfs_handles created we
+ * If cb_proplist is provided then in the zfs_handles created we
* retain only those properties listed in cb_proplist and sortcol.
* The rest are pruned. So, the caller should make sure that no other
* properties other than those listed in cb_proplist/sortcol are
@@ -399,11 +396,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
sizeof (cb.cb_props_table));
}
- if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory\n"));
- exit(1);
- }
+ if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+ nomem();
if (argc == 0) {
/*
@@ -453,11 +447,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
/*
* Finally, clean up the AVL tree.
*/
- if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory"));
- exit(1);
- }
+ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
while ((node = uu_avl_walk_next(walk)) != NULL) {
uu_avl_remove(cb.cb_avl, node);
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
index a0290775b6b6..8c6b9fdef54f 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h
@@ -42,6 +42,7 @@ typedef struct zfs_sort_column {
#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
#define ZFS_ITER_PROP_LISTSNAPS (1 << 2)
#define ZFS_ITER_DEPTH_LIMIT (1 << 3)
+#define ZFS_ITER_RECVD_PROPS (1 << 4)
int zfs_for_each(int, char **, int options, zfs_type_t,
zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
index c5e9b64a1f4b..8383dbc2dde7 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
@@ -20,8 +20,8 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
*/
#include <assert.h>
@@ -41,23 +41,33 @@
#include <zone.h>
#include <grp.h>
#include <pwd.h>
+#include <signal.h>
+#include <sys/list.h>
#include <sys/mntent.h>
#include <sys/mnttab.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include <time.h>
#include <libzfs.h>
+#include <zfs_prop.h>
+#include <zfs_deleg.h>
#include <libuutil.h>
+#ifdef sun
+#include <aclutils.h>
+#include <directory.h>
+#endif
#include "zfs_iter.h"
#include "zfs_util.h"
+#include "zfs_comutil.h"
libzfs_handle_t *g_zfs;
static FILE *mnttab_file;
static char history_str[HIS_MAX_RECORD_LEN];
-const char *pypath = "/usr/lib/zfs/pyzfs.py";
static int zfs_do_clone(int argc, char **argv);
static int zfs_do_create(int argc, char **argv);
@@ -78,7 +88,12 @@ static int zfs_do_send(int argc, char **argv);
static int zfs_do_receive(int argc, char **argv);
static int zfs_do_promote(int argc, char **argv);
static int zfs_do_userspace(int argc, char **argv);
-static int zfs_do_python(int argc, char **argv);
+static int zfs_do_allow(int argc, char **argv);
+static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_holds(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
static int zfs_do_jail(int argc, char **argv);
static int zfs_do_unjail(int argc, char **argv);
@@ -124,7 +139,11 @@ typedef enum {
HELP_ALLOW,
HELP_UNALLOW,
HELP_USERSPACE,
- HELP_GROUPSPACE
+ HELP_GROUPSPACE,
+ HELP_HOLD,
+ HELP_HOLDS,
+ HELP_RELEASE,
+ HELP_DIFF
} zfs_help_t;
typedef struct zfs_command {
@@ -155,7 +174,7 @@ static zfs_command_t command_table[] = {
{ "list", zfs_do_list, HELP_LIST },
{ NULL },
{ "set", zfs_do_set, HELP_SET },
- { "get", zfs_do_get, HELP_GET },
+ { "get", zfs_do_get, HELP_GET },
{ "inherit", zfs_do_inherit, HELP_INHERIT },
{ "upgrade", zfs_do_upgrade, HELP_UPGRADE },
{ "userspace", zfs_do_userspace, HELP_USERSPACE },
@@ -169,9 +188,14 @@ static zfs_command_t command_table[] = {
{ "send", zfs_do_send, HELP_SEND },
{ "receive", zfs_do_receive, HELP_RECEIVE },
{ NULL },
- { "allow", zfs_do_python, HELP_ALLOW },
+ { "allow", zfs_do_allow, HELP_ALLOW },
+ { NULL },
+ { "unallow", zfs_do_unallow, HELP_UNALLOW },
{ NULL },
- { "unallow", zfs_do_python, HELP_UNALLOW },
+ { "hold", zfs_do_hold, HELP_HOLD },
+ { "holds", zfs_do_holds, HELP_HOLDS },
+ { "release", zfs_do_release, HELP_RELEASE },
+ { "diff", zfs_do_diff, HELP_DIFF },
{ NULL },
{ "jail", zfs_do_jail, HELP_JAIL },
{ "unjail", zfs_do_unjail, HELP_UNJAIL },
@@ -194,15 +218,15 @@ get_usage(zfs_help_t idx)
"\tcreate [-ps] [-b blocksize] [-o property=value] ... "
"-V <size> <volume>\n"));
case HELP_DESTROY:
- return (gettext("\tdestroy [-rRf] "
- "<filesystem|volume|snapshot>\n"));
+ return (gettext("\tdestroy [-rRf] <filesystem|volume>\n"
+ "\tdestroy [-rRd] <snapshot>\n"));
case HELP_GET:
return (gettext("\tget [-rHp] [-d max] "
- "[-o field[,...]] [-s source[,...]]\n"
+ "[-o \"all\" | field[,...]] [-s source[,...]]\n"
"\t <\"all\" | property[,...]> "
"[filesystem|volume|snapshot] ...\n"));
case HELP_INHERIT:
- return (gettext("\tinherit [-r] <property> "
+ return (gettext("\tinherit [-rS] <property> "
"<filesystem|volume|snapshot> ...\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade [-v]\n"
@@ -222,9 +246,9 @@ get_usage(zfs_help_t idx)
case HELP_PROMOTE:
return (gettext("\tpromote <clone-filesystem>\n"));
case HELP_RECEIVE:
- return (gettext("\treceive [-vnF] <filesystem|volume|"
+ return (gettext("\treceive [-vnFu] <filesystem|volume|"
"snapshot>\n"
- "\treceive [-vnF] -d <filesystem>\n"));
+ "\treceive [-vnFu] [-d | -e] <filesystem>\n"));
case HELP_RENAME:
return (gettext("\trename <filesystem|volume|snapshot> "
"<filesystem|volume|snapshot>\n"
@@ -233,7 +257,7 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-R] [-[iI] snapshot] <snapshot>\n"));
+ return (gettext("\tsend [-RDp] [-[iI] snapshot] <snapshot>\n"));
case HELP_SET:
return (gettext("\tset <property=value> "
"<filesystem|volume|snapshot> ...\n"));
@@ -246,10 +270,11 @@ get_usage(zfs_help_t idx)
return (gettext("\tunmount [-f] "
"<-a | filesystem|mountpoint>\n"));
case HELP_UNSHARE:
- return (gettext("\tunshare [-f] "
+ return (gettext("\tunshare "
"<-a | filesystem|mountpoint>\n"));
case HELP_ALLOW:
- return (gettext("\tallow [-ldug] "
+ return (gettext("\tallow <filesystem|volume>\n"
+ "\tallow [-ldug] "
"<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
"\t <filesystem|volume>\n"
"\tallow [-ld] -e <perm|@setname>[,...] "
@@ -275,28 +300,54 @@ get_usage(zfs_help_t idx)
return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
"[-sS field] ... [-t type[,...]]\n"
"\t <filesystem|snapshot>\n"));
+ case HELP_HOLD:
+ return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+ case HELP_HOLDS:
+ return (gettext("\tholds [-r] <snapshot> ...\n"));
+ case HELP_RELEASE:
+ return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+ case HELP_DIFF:
+ return (gettext("\tdiff [-FHt] <snapshot> "
+ "[snapshot|filesystem]\n"));
}
abort();
/* NOTREACHED */
}
+void
+nomem(void)
+{
+ (void) fprintf(stderr, gettext("internal error: out of memory\n"));
+ exit(1);
+}
+
/*
* Utility function to guarantee malloc() success.
*/
+
void *
safe_malloc(size_t size)
{
void *data;
- if ((data = calloc(1, size)) == NULL) {
- (void) fprintf(stderr, "internal error: out of memory\n");
- exit(1);
- }
+ if ((data = calloc(1, size)) == NULL)
+ nomem();
return (data);
}
+static char *
+safe_strdup(char *str)
+{
+ char *dupstr = strdup(str);
+
+ if (dupstr == NULL)
+ nomem();
+
+ return (dupstr);
+}
+
/*
* Callback routine that will print out information for each of
* the properties.
@@ -435,11 +486,8 @@ parseprop(nvlist_t *props)
"specified multiple times\n"), propname);
return (-1);
}
- if (nvlist_add_string(props, propname, propval) != 0) {
- (void) fprintf(stderr, gettext("internal "
- "error: out of memory\n"));
- return (-1);
- }
+ if (nvlist_add_string(props, propname, propval) != 0)
+ nomem();
return (0);
}
@@ -464,6 +512,59 @@ parse_depth(char *opt, int *flags)
return (depth);
}
+#define PROGRESS_DELAY 2 /* seconds */
+
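+/*
+ * pt_reverse is a run of backspaces; update_progress() prints the new token
+ * followed by just enough of this string to move the cursor back over it, so
+ * the next token overwrites the previous one in place.
+ */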
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+ pt_begin = time(NULL) + PROGRESS_DELAY;
+ pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+ assert(pt_header == NULL);
+ pt_header = safe_strdup(header);
+ if (pt_shown) {
+ (void) printf("%s: ", header);
+ (void) fflush(stdout);
+ }
+}
+
+static void
+update_progress(char *update)
+{
+ if (!pt_shown && time(NULL) > pt_begin) {
+ int len = strlen(update);
+
+ (void) printf("%s: %s%*.*s", pt_header, update, len, len,
+ pt_reverse);
+ (void) fflush(stdout);
+ pt_shown = B_TRUE;
+ } else if (pt_shown) {
+ int len = strlen(update);
+
+ (void) printf("%s%*.*s", update, len, len, pt_reverse);
+ (void) fflush(stdout);
+ }
+}
+
+static void
+finish_progress(char *done)
+{
+ if (pt_shown) {
+ (void) printf("%s\n", done);
+ (void) fflush(stdout);
+ }
+ free(pt_header);
+ pt_header = NULL;
+}
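+
+/*
+ * Illustrative flow for the progress helpers above (a sketch, not called
+ * here; the work loop and name-fetching function are hypothetical):
+ *
+ *	start_progress_timer();
+ *	set_progress_header("receiving");
+ *	while (have_more_work())
+ *		update_progress(current_dataset_name());
+ *	finish_progress("done");
+ */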
/*
* zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
*
@@ -483,11 +584,8 @@ zfs_do_clone(int argc, char **argv)
int ret;
int c;
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
- (void) fprintf(stderr, gettext("internal error: "
- "out of memory\n"));
- return (1);
- }
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
/* check options */
while ((c = getopt(argc, argv, "o:p")) != -1) {
@@ -552,8 +650,9 @@ zfs_do_clone(int argc, char **argv)
clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
if (clone != NULL) {
- if ((ret = zfs_mount(clone, NULL, 0)) == 0)
- ret = zfs_share(clone);
+ if (zfs_get_type(clone) != ZFS_TYPE_VOLUME)
+ if ((ret = zfs_mount(clone, NULL, 0)) == 0)
+ ret = zfs_share(clone);
zfs_close(clone);
}
}
@@ -599,13 +698,10 @@ zfs_do_create(int argc, char **argv)
int ret = 1;
nvlist_t *props;
uint64_t intval;
- int canmount;
+ int canmount = ZFS_CANMOUNT_OFF;
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
- (void) fprintf(stderr, gettext("internal error: "
- "out of memory\n"));
- return (1);
- }
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
/* check options */
while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) {
@@ -620,12 +716,8 @@ zfs_do_create(int argc, char **argv)
}
if (nvlist_add_uint64(props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- intval) != 0) {
- (void) fprintf(stderr, gettext("internal "
- "error: out of memory\n"));
- goto error;
- }
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+ nomem();
volsize = intval;
break;
case 'p':
@@ -642,11 +734,8 @@ zfs_do_create(int argc, char **argv)
if (nvlist_add_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- intval) != 0) {
- (void) fprintf(stderr, gettext("internal "
- "error: out of memory\n"));
- goto error;
- }
+ intval) != 0)
+ nomem();
break;
case 'o':
if (parseprop(props))
@@ -708,15 +797,14 @@ zfs_do_create(int argc, char **argv)
resv_prop = ZFS_PROP_REFRESERVATION;
else
resv_prop = ZFS_PROP_RESERVATION;
+ volsize = zvol_volsize_to_reservation(volsize, props);
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
&strval) != 0) {
if (nvlist_add_uint64(props,
zfs_prop_to_name(resv_prop), volsize) != 0) {
- (void) fprintf(stderr, gettext("internal "
- "error: out of memory\n"));
nvlist_free(props);
- return (1);
+ nomem();
}
}
}
@@ -741,19 +829,20 @@ zfs_do_create(int argc, char **argv)
if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
goto error;
+
+ ret = 0;
/*
* if the user doesn't want the dataset automatically mounted,
* then skip the mount/share step
*/
-
- canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type))
+ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
/*
* Mount and/or share the new filesystem as appropriate. We provide a
* verbose error message to let the user know that their filesystem was
* in fact created, even if we failed to mount or share it.
*/
- ret = 0;
if (canmount == ZFS_CANMOUNT_ON) {
if (zfs_mount(zhp, NULL, 0) != 0) {
(void) fprintf(stderr, gettext("filesystem "
@@ -778,11 +867,13 @@ badusage:
}
/*
- * zfs destroy [-rf] <fs, snap, vol>
+ * zfs destroy [-rRf] <fs, vol>
+ * zfs destroy [-rRd] <snap>
*
- * -r Recursively destroy all children
- * -R Recursively destroy all dependents, including clones
- * -f Force unmounting of any dependents
+ * -r Recursively destroy all children
+ * -R Recursively destroy all dependents, including clones
+ * -f Force unmounting of any dependents
+ * -d If we can't destroy now, mark for deferred destruction
*
* Destroys the given dataset. By default, it will unmount any filesystems,
* and refuse to destroy a dataset that has any dependents. A dependent can
@@ -798,6 +889,7 @@ typedef struct destroy_cbdata {
boolean_t cb_closezhp;
zfs_handle_t *cb_target;
char *cb_snapname;
+ boolean_t cb_defer_destroy;
} destroy_cbdata_t;
/*
@@ -866,7 +958,7 @@ destroy_callback(zfs_handle_t *zhp, void *data)
/*
* Ignore pools (which we've already flagged as an error before getting
- * here.
+ * here).
*/
if (strchr(zfs_get_name(zhp), '/') == NULL &&
zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
@@ -878,7 +970,7 @@ destroy_callback(zfs_handle_t *zhp, void *data)
* Bail out on the first error.
*/
if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
- zfs_destroy(zhp) != 0) {
+ zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) {
zfs_close(zhp);
return (-1);
}
@@ -930,10 +1022,15 @@ zfs_do_destroy(int argc, char **argv)
int c;
zfs_handle_t *zhp;
char *cp;
+ zfs_type_t type = ZFS_TYPE_DATASET;
/* check options */
- while ((c = getopt(argc, argv, "frR")) != -1) {
+ while ((c = getopt(argc, argv, "dfrR")) != -1) {
switch (c) {
+ case 'd':
+ cb.cb_defer_destroy = B_TRUE;
+ type = ZFS_TYPE_SNAPSHOT;
+ break;
case 'f':
cb.cb_force = 1;
break;
@@ -979,14 +1076,22 @@ zfs_do_destroy(int argc, char **argv)
cp++;
if (cb.cb_doclones) {
+ boolean_t defer = cb.cb_defer_destroy;
+
+ /*
+ * Temporarily ignore the defer_destroy setting since
+ * it's not supported for clones.
+ */
+ cb.cb_defer_destroy = B_FALSE;
cb.cb_snapname = cp;
if (destroy_snap_clones(zhp, &cb) != 0) {
zfs_close(zhp);
return (1);
}
+ cb.cb_defer_destroy = defer;
}
- ret = zfs_destroy_snaps(zhp, cp);
+ ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy);
zfs_close(zhp);
if (ret) {
(void) fprintf(stderr,
@@ -995,9 +1100,8 @@ zfs_do_destroy(int argc, char **argv)
return (ret != 0);
}
-
/* Open the given dataset */
- if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
+ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
return (1);
cb.cb_target = zhp;
@@ -1023,15 +1127,15 @@ zfs_do_destroy(int argc, char **argv)
* Check for any dependents and/or clones.
*/
cb.cb_first = B_TRUE;
- if (!cb.cb_doclones &&
+ if (!cb.cb_doclones && !cb.cb_defer_destroy &&
zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
&cb) != 0) {
zfs_close(zhp);
return (1);
}
- if (cb.cb_error ||
- zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
+ if (cb.cb_error || (!cb.cb_defer_destroy &&
+ (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) {
zfs_close(zhp);
return (1);
}
@@ -1044,22 +1148,35 @@ zfs_do_destroy(int argc, char **argv)
if (destroy_callback(zhp, &cb) != 0)
return (1);
-
return (0);
}
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+ int i;
+ zfs_get_column_t col;
+
+ for (i = 0; i < ZFS_GET_NCOLS &&
+ (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
+ if (col == GET_COL_RECVD)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
/*
- * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
- * < all | property[,property]... > < fs | snap | vol > ...
+ * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
+ * < all | property[,property]... > < fs | snap | vol > ...
*
* -r recurse over any child datasets
* -H scripted mode. Headers are stripped, and fields are separated
* by tabs instead of spaces.
- * -o Set of fields to display. One of "name,property,value,source".
- * Default is all four.
+ * -o Set of fields to display. One of "name,property,value,
+ * received,source". Default is "name,property,value,source".
+ * "all" is an alias for all five.
* -s Set of sources to allow. One of
- * "local,default,inherited,temporary,none". Default is all
- * five.
+ * "local,default,inherited,received,temporary,none". Default is
+ * all six.
* -p Display values in parsable (literal) format.
*
* Prints properties for the given datasets. The user can control which
@@ -1073,16 +1190,19 @@ static int
get_callback(zfs_handle_t *zhp, void *data)
{
char buf[ZFS_MAXPROPLEN];
+ char rbuf[ZFS_MAXPROPLEN];
zprop_source_t sourcetype;
char source[ZFS_MAXNAMELEN];
zprop_get_cbdata_t *cbp = data;
- nvlist_t *userprop = zfs_get_user_props(zhp);
+ nvlist_t *user_props = zfs_get_user_props(zhp);
zprop_list_t *pl = cbp->cb_proplist;
nvlist_t *propval;
char *strval;
char *sourceval;
+ boolean_t received = is_recvd_column(cbp);
for (; pl != NULL; pl = pl->pl_next) {
+ char *recvdval = NULL;
/*
* Skip the special fake placeholder. This will also skip over
* the name property when 'all' is specified.
@@ -1109,9 +1229,14 @@ get_callback(zfs_handle_t *zhp, void *data)
(void) strlcpy(buf, "-", sizeof (buf));
}
+ if (received && (zfs_prop_get_recvd(zhp,
+ zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
+ cbp->cb_literal) == 0))
+ recvdval = rbuf;
+
zprop_print_one_property(zfs_get_name(zhp), cbp,
zfs_prop_to_name(pl->pl_prop),
- buf, sourcetype, source);
+ buf, sourcetype, source, recvdval);
} else if (zfs_prop_userquota(pl->pl_user_prop)) {
sourcetype = ZPROP_SRC_LOCAL;
@@ -1122,9 +1247,9 @@ get_callback(zfs_handle_t *zhp, void *data)
}
zprop_print_one_property(zfs_get_name(zhp), cbp,
- pl->pl_user_prop, buf, sourcetype, source);
+ pl->pl_user_prop, buf, sourcetype, source, NULL);
} else {
- if (nvlist_lookup_nvlist(userprop,
+ if (nvlist_lookup_nvlist(user_props,
pl->pl_user_prop, &propval) != 0) {
if (pl->pl_all)
continue;
@@ -1139,6 +1264,9 @@ get_callback(zfs_handle_t *zhp, void *data)
if (strcmp(sourceval,
zfs_get_name(zhp)) == 0) {
sourcetype = ZPROP_SRC_LOCAL;
+ } else if (strcmp(sourceval,
+ ZPROP_SOURCE_VAL_RECVD) == 0) {
+ sourcetype = ZPROP_SRC_RECEIVED;
} else {
sourcetype = ZPROP_SRC_INHERITED;
(void) strlcpy(source,
@@ -1146,9 +1274,14 @@ get_callback(zfs_handle_t *zhp, void *data)
}
}
+ if (received && (zfs_prop_get_recvd(zhp,
+ pl->pl_user_prop, rbuf, sizeof (rbuf),
+ cbp->cb_literal) == 0))
+ recvdval = rbuf;
+
zprop_print_one_property(zfs_get_name(zhp), cbp,
pl->pl_user_prop, strval, sourcetype,
- source);
+ source, recvdval);
}
}
@@ -1204,10 +1337,10 @@ zfs_do_get(int argc, char **argv)
i = 0;
while (*optarg != '\0') {
static char *col_subopts[] =
- { "name", "property", "value", "source",
- NULL };
+ { "name", "property", "value", "received",
+ "source", "all", NULL };
- if (i == 4) {
+ if (i == ZFS_GET_NCOLS) {
(void) fprintf(stderr, gettext("too "
"many fields given to -o "
"option\n"));
@@ -1226,8 +1359,28 @@ zfs_do_get(int argc, char **argv)
cb.cb_columns[i++] = GET_COL_VALUE;
break;
case 3:
+ cb.cb_columns[i++] = GET_COL_RECVD;
+ flags |= ZFS_ITER_RECVD_PROPS;
+ break;
+ case 4:
cb.cb_columns[i++] = GET_COL_SOURCE;
break;
+ case 5:
+ if (i > 0) {
+ (void) fprintf(stderr,
+ gettext("\"all\" conflicts "
+ "with specific fields "
+ "given to -o option\n"));
+ usage(B_FALSE);
+ }
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_RECVD;
+ cb.cb_columns[4] = GET_COL_SOURCE;
+ flags |= ZFS_ITER_RECVD_PROPS;
+ i = ZFS_GET_NCOLS;
+ break;
default:
(void) fprintf(stderr,
gettext("invalid column name "
@@ -1242,7 +1395,8 @@ zfs_do_get(int argc, char **argv)
while (*optarg != '\0') {
static char *source_subopts[] = {
"local", "default", "inherited",
- "temporary", "none", NULL };
+ "received", "temporary", "none",
+ NULL };
switch (getsubopt(&optarg, source_subopts,
&value)) {
@@ -1256,9 +1410,12 @@ zfs_do_get(int argc, char **argv)
cb.cb_sources |= ZPROP_SRC_INHERITED;
break;
case 3:
- cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+ cb.cb_sources |= ZPROP_SRC_RECEIVED;
break;
case 4:
+ cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+ break;
+ case 5:
cb.cb_sources |= ZPROP_SRC_NONE;
break;
default:
@@ -1325,9 +1482,10 @@ zfs_do_get(int argc, char **argv)
}
/*
- * inherit [-r] <property> <fs|vol> ...
+ * inherit [-rS] <property> <fs|vol> ...
*
- * -r Recurse over all children
+ * -r Recurse over all children
+ * -S Revert to received value, if any
*
* For each dataset specified on the command line, inherit the given property
* from its parent. Inheriting a property at the pool level will cause it to
@@ -1336,11 +1494,16 @@ zfs_do_get(int argc, char **argv)
* local modifications for each dataset.
*/
+typedef struct inherit_cbdata {
+ const char *cb_propname;
+ boolean_t cb_received;
+} inherit_cbdata_t;
+
static int
inherit_recurse_cb(zfs_handle_t *zhp, void *data)
{
- char *propname = data;
- zfs_prop_t prop = zfs_name_to_prop(propname);
+ inherit_cbdata_t *cb = data;
+ zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
/*
* If we're doing it recursively, then ignore properties that
@@ -1350,15 +1513,15 @@ inherit_recurse_cb(zfs_handle_t *zhp, void *data)
!zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
return (0);
- return (zfs_prop_inherit(zhp, propname) != 0);
+ return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
}
static int
inherit_cb(zfs_handle_t *zhp, void *data)
{
- char *propname = data;
+ inherit_cbdata_t *cb = data;
- return (zfs_prop_inherit(zhp, propname) != 0);
+ return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
}
static int
@@ -1366,16 +1529,21 @@ zfs_do_inherit(int argc, char **argv)
{
int c;
zfs_prop_t prop;
+ inherit_cbdata_t cb = { 0 };
char *propname;
int ret;
int flags = 0;
+ boolean_t received = B_FALSE;
/* check options */
- while ((c = getopt(argc, argv, "r")) != -1) {
+ while ((c = getopt(argc, argv, "rS")) != -1) {
switch (c) {
case 'r':
flags |= ZFS_ITER_RECURSE;
break;
+ case 'S':
+ received = B_TRUE;
+ break;
case '?':
default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -1408,7 +1576,7 @@ zfs_do_inherit(int argc, char **argv)
propname);
return (1);
}
- if (!zfs_prop_inheritable(prop)) {
+ if (!zfs_prop_inheritable(prop) && !received) {
(void) fprintf(stderr, gettext("'%s' property cannot "
"be inherited\n"), propname);
if (prop == ZFS_PROP_QUOTA ||
@@ -1419,18 +1587,27 @@ zfs_do_inherit(int argc, char **argv)
"%s=none' to clear\n"), propname);
return (1);
}
+ if (received && (prop == ZFS_PROP_VOLSIZE ||
+ prop == ZFS_PROP_VERSION)) {
+ (void) fprintf(stderr, gettext("'%s' property cannot "
+ "be reverted to a received value\n"), propname);
+ return (1);
+ }
} else if (!zfs_prop_user(propname)) {
(void) fprintf(stderr, gettext("invalid property '%s'\n"),
propname);
usage(B_FALSE);
}
+ cb.cb_propname = propname;
+ cb.cb_received = received;
+
if (flags & ZFS_ITER_RECURSE) {
ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
- NULL, NULL, 0, inherit_recurse_cb, propname);
+ NULL, NULL, 0, inherit_recurse_cb, &cb);
} else {
ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
- NULL, NULL, 0, inherit_cb, propname);
+ NULL, NULL, 0, inherit_cb, &cb);
}
return (ret);
@@ -1499,31 +1676,25 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data)
{
upgrade_cbdata_t *cb = data;
int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
- int i;
- static struct { int zplver; int spaver; } table[] = {
- {ZPL_VERSION_FUID, SPA_VERSION_FUID},
- {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
- {0, 0}
- };
+ int needed_spa_version;
+ int spa_version;
+ if (zfs_spa_version(zhp, &spa_version) < 0)
+ return (-1);
- for (i = 0; table[i].zplver; i++) {
- if (cb->cb_version >= table[i].zplver) {
- int spa_version;
+ needed_spa_version = zfs_spa_version_map(cb->cb_version);
- if (zfs_spa_version(zhp, &spa_version) < 0)
- return (-1);
+ if (needed_spa_version < 0)
+ return (-1);
- if (spa_version < table[i].spaver) {
- /* can't upgrade */
- (void) printf(gettext("%s: can not be "
- "upgraded; the pool version needs to first "
- "be upgraded\nto version %d\n\n"),
- zfs_get_name(zhp), table[i].spaver);
- cb->cb_numfailed++;
- return (0);
- }
- }
+ if (spa_version < needed_spa_version) {
+ /* can't upgrade */
+ (void) printf(gettext("%s: can not be "
+ "upgraded; the pool version needs to first "
+ "be upgraded\nto version %d\n\n"),
+ zfs_get_name(zhp), needed_spa_version);
+ cb->cb_numfailed++;
+ return (0);
}
/* upgrade */
@@ -1622,14 +1793,13 @@ zfs_do_upgrade(int argc, char **argv)
(void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
(void) printf(gettext(" 2 Enhanced directory entries\n"));
(void) printf(gettext(" 3 Case insensitive and File system "
- "unique identifer (FUID)\n"));
+ "unique identifier (FUID)\n"));
(void) printf(gettext(" 4 userquota, groupquota "
"properties\n"));
+ (void) printf(gettext(" 5 System attributes\n"));
(void) printf(gettext("\nFor more information on a particular "
- "version, including supported releases, see:\n\n"));
- (void) printf("http://www.opensolaris.org/os/community/zfs/"
- "version/zpl/N\n\n");
- (void) printf(gettext("Where 'N' is the version number.\n"));
+ "version, including supported releases,\n"));
+ (void) printf("see the ZFS Administration Guide.\n\n");
ret = 0;
} else if (argc || all) {
/* Upgrade filesystems */
@@ -1672,82 +1842,730 @@ zfs_do_upgrade(int argc, char **argv)
return (ret);
}
+#define USTYPE_USR_BIT (0)
+#define USTYPE_GRP_BIT (1)
+#define USTYPE_PSX_BIT (2)
+#define USTYPE_SMB_BIT (3)
+
+#define USTYPE_USR (1 << USTYPE_USR_BIT)
+#define USTYPE_GRP (1 << USTYPE_GRP_BIT)
+
+#define USTYPE_PSX (1 << USTYPE_PSX_BIT)
+#define USTYPE_SMB (1 << USTYPE_SMB_BIT)
+
+#define USTYPE_PSX_USR (USTYPE_PSX | USTYPE_USR)
+#define USTYPE_SMB_USR (USTYPE_SMB | USTYPE_USR)
+#define USTYPE_PSX_GRP (USTYPE_PSX | USTYPE_GRP)
+#define USTYPE_SMB_GRP (USTYPE_SMB | USTYPE_GRP)
+#define USTYPE_ALL (USTYPE_PSX_USR | USTYPE_SMB_USR \
+ | USTYPE_PSX_GRP | USTYPE_SMB_GRP)
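+
+/*
+ * A type value ORs one identity bit (USR or GRP) with one namespace bit
+ * (PSX or SMB); e.g. USTYPE_PSX_USR identifies a POSIX user entry.
+ */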
+
+
+#define USPROP_USED_BIT (0)
+#define USPROP_QUOTA_BIT (1)
+
+#define USPROP_USED (1 << USPROP_USED_BIT)
+#define USPROP_QUOTA (1 << USPROP_QUOTA_BIT)
+
+typedef struct us_node {
+ nvlist_t *usn_nvl;
+ uu_avl_node_t usn_avlnode;
+ uu_list_node_t usn_listnode;
+} us_node_t;
+
+typedef struct us_cbdata {
+ nvlist_t **cb_nvlp;
+ uu_avl_pool_t *cb_avl_pool;
+ uu_avl_t *cb_avl;
+ boolean_t cb_numname;
+ boolean_t cb_nicenum;
+ boolean_t cb_sid2posix;
+ zfs_userquota_prop_t cb_prop;
+ zfs_sort_column_t *cb_sortcol;
+ size_t cb_max_typelen;
+ size_t cb_max_namelen;
+ size_t cb_max_usedlen;
+ size_t cb_max_quotalen;
+} us_cbdata_t;
+
+typedef struct {
+ zfs_sort_column_t *si_sortcol;
+ boolean_t si_num_name;
+ boolean_t si_parsable;
+} us_sort_info_t;
+
+static int
+us_compare(const void *larg, const void *rarg, void *unused)
+{
+ const us_node_t *l = larg;
+ const us_node_t *r = rarg;
+ int rc = 0;
+ us_sort_info_t *si = (us_sort_info_t *)unused;
+ zfs_sort_column_t *sortcol = si->si_sortcol;
+ boolean_t num_name = si->si_num_name;
+ nvlist_t *lnvl = l->usn_nvl;
+ nvlist_t *rnvl = r->usn_nvl;
+
+ for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+ char *lvstr = "";
+ char *rvstr = "";
+ uint32_t lv32 = 0;
+ uint32_t rv32 = 0;
+ uint64_t lv64 = 0;
+ uint64_t rv64 = 0;
+ zfs_prop_t prop = sortcol->sc_prop;
+ const char *propname = NULL;
+ boolean_t reverse = sortcol->sc_reverse;
+
+ switch (prop) {
+ case ZFS_PROP_TYPE:
+ propname = "type";
+ (void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+ (void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+ if (rv32 != lv32)
+ rc = (rv32 > lv32) ? 1 : -1;
+ break;
+ case ZFS_PROP_NAME:
+ propname = "name";
+ if (num_name) {
+ (void) nvlist_lookup_uint32(lnvl, propname,
+ &lv32);
+ (void) nvlist_lookup_uint32(rnvl, propname,
+ &rv32);
+ if (rv32 != lv32)
+ rc = (rv32 > lv32) ? 1 : -1;
+ } else {
+ (void) nvlist_lookup_string(lnvl, propname,
+ &lvstr);
+ (void) nvlist_lookup_string(rnvl, propname,
+ &rvstr);
+ rc = strcmp(lvstr, rvstr);
+ }
+ break;
+
+ case ZFS_PROP_USED:
+ case ZFS_PROP_QUOTA:
+ if (ZFS_PROP_USED == prop)
+ propname = "used";
+ else
+ propname = "quota";
+ (void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+ (void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+ if (rv64 != lv64)
+ rc = (rv64 > lv64) ? 1 : -1;
+ }
+
+ if (rc)
+ if (rc < 0)
+ return (reverse ? 1 : -1);
+ else
+ return (reverse ? -1 : 1);
+ }
+
+ return (rc);
+}
+
+static inline const char *
+us_type2str(unsigned field_type)
+{
+ switch (field_type) {
+ case USTYPE_PSX_USR:
+ return ("POSIX User");
+ case USTYPE_PSX_GRP:
+ return ("POSIX Group");
+ case USTYPE_SMB_USR:
+ return ("SMB User");
+ case USTYPE_SMB_GRP:
+ return ("SMB Group");
+ default:
+ return ("Undefined");
+ }
+}
+
/*
* zfs userspace
*/
static int
userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
{
- zfs_userquota_prop_t *typep = arg;
- zfs_userquota_prop_t p = *typep;
+ us_cbdata_t *cb = (us_cbdata_t *)arg;
+ zfs_userquota_prop_t prop = cb->cb_prop;
char *name = NULL;
- char *ug, *propname;
+ char *propname;
char namebuf[32];
char sizebuf[32];
+ us_node_t *node;
+ uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
+ uu_avl_t *avl = cb->cb_avl;
+ uu_avl_index_t idx;
+ nvlist_t *props;
+ us_node_t *n;
+ zfs_sort_column_t *sortcol = cb->cb_sortcol;
+ unsigned type;
+ const char *typestr;
+ size_t namelen;
+ size_t typelen;
+ size_t sizelen;
+ us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
if (domain == NULL || domain[0] == '\0') {
- if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) {
+ /* POSIX */
+ if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+ type = USTYPE_PSX_GRP;
struct group *g = getgrgid(rid);
if (g)
name = g->gr_name;
} else {
+ type = USTYPE_PSX_USR;
struct passwd *p = getpwuid(rid);
if (p)
name = p->pw_name;
}
+ } else {
+ char sid[ZFS_MAXNAMELEN+32];
+ uid_t id;
+ uint64_t classes;
+#ifdef sun
+ int err;
+ directory_error_t e;
+#endif
+
+ (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+ /* SMB */
+ if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+ type = USTYPE_SMB_GRP;
+#ifdef sun
+ err = sid_to_id(sid, B_FALSE, &id);
+#endif
+ } else {
+ type = USTYPE_SMB_USR;
+#ifdef sun
+ err = sid_to_id(sid, B_TRUE, &id);
+#endif
+ }
+
+#ifdef sun
+ if (err == 0) {
+ rid = id;
+
+ e = directory_name_from_sid(NULL, sid, &name, &classes);
+ if (e != NULL) {
+ directory_error_free(e);
+ return (-1);
+ }
+
+ if (name == NULL)
+ name = sid;
+ }
+#endif
}
- if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA)
- ug = "group";
- else
- ug = "user";
- if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED)
+ if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED)
propname = "used";
else
propname = "quota";
- if (name == NULL) {
- (void) snprintf(namebuf, sizeof (namebuf),
- "%llu", (longlong_t)rid);
+ (void) snprintf(namebuf, sizeof (namebuf), "%u", rid);
+ if (name == NULL)
name = namebuf;
+
+ if (cb->cb_nicenum)
+ zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+ else
+ (void) sprintf(sizebuf, "%llu", space);
+
+ node = safe_malloc(sizeof (us_node_t));
+ uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
+ free(node);
+ return (-1);
}
- zfs_nicenum(space, sizebuf, sizeof (sizebuf));
- (void) printf("%s %s %s%c%s %s\n", propname, ug, domain,
- domain[0] ? '-' : ' ', name, sizebuf);
+ if (nvlist_add_uint32(props, "type", type) != 0)
+ nomem();
+
+ if (cb->cb_numname) {
+ if (nvlist_add_uint32(props, "name", rid) != 0)
+ nomem();
+ namelen = strlen(namebuf);
+ } else {
+ if (nvlist_add_string(props, "name", name) != 0)
+ nomem();
+ namelen = strlen(name);
+ }
+
+ typestr = us_type2str(type);
+ typelen = strlen(gettext(typestr));
+ if (typelen > cb->cb_max_typelen)
+ cb->cb_max_typelen = typelen;
+
+ if (namelen > cb->cb_max_namelen)
+ cb->cb_max_namelen = namelen;
+
+ sizelen = strlen(sizebuf);
+ if (0 == strcmp(propname, "used")) {
+ if (sizelen > cb->cb_max_usedlen)
+ cb->cb_max_usedlen = sizelen;
+ } else {
+ if (sizelen > cb->cb_max_quotalen)
+ cb->cb_max_quotalen = sizelen;
+ }
+
+ node->usn_nvl = props;
+
+ n = uu_avl_find(avl, node, &sortinfo, &idx);
+ if (n == NULL)
+ uu_avl_insert(avl, node, idx);
+ else {
+ nvlist_free(props);
+ free(node);
+ node = n;
+ props = node->usn_nvl;
+ }
+
+ if (nvlist_add_uint64(props, propname, space) != 0)
+ nomem();
return (0);
}
+static inline boolean_t
+usprop_check(zfs_userquota_prop_t p, unsigned types, unsigned props)
+{
+ unsigned type;
+ unsigned prop;
+
+ switch (p) {
+ case ZFS_PROP_USERUSED:
+ type = USTYPE_USR;
+ prop = USPROP_USED;
+ break;
+ case ZFS_PROP_USERQUOTA:
+ type = USTYPE_USR;
+ prop = USPROP_QUOTA;
+ break;
+ case ZFS_PROP_GROUPUSED:
+ type = USTYPE_GRP;
+ prop = USPROP_USED;
+ break;
+ case ZFS_PROP_GROUPQUOTA:
+ type = USTYPE_GRP;
+ prop = USPROP_QUOTA;
+ break;
+ default: /* ALL */
+ return (B_TRUE);
+ }
+
+ return (type & types && prop & props);
+}
+
+#define USFIELD_TYPE (1 << 0)
+#define USFIELD_NAME (1 << 1)
+#define USFIELD_USED (1 << 2)
+#define USFIELD_QUOTA (1 << 3)
+#define USFIELD_ALL (USFIELD_TYPE | USFIELD_NAME | USFIELD_USED | USFIELD_QUOTA)
+
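+/*
+ * Parse a comma-separated list of names (used for both -o fields and
+ * -t types), OR-ing the matching bit from bits[] into *fieldsp.
+ * Returns -1 on an unknown name.
+ */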
+static int
+parsefields(unsigned *fieldsp, char **names, unsigned *bits, size_t len)
+{
+ char *field = optarg;
+ char *delim;
+
+ do {
+ int i;
+ boolean_t found = B_FALSE;
+ delim = strchr(field, ',');
+ if (delim != NULL)
+ *delim = '\0';
+
+ for (i = 0; i < len; i++)
+ if (0 == strcmp(field, names[i])) {
+ found = B_TRUE;
+ *fieldsp |= bits[i];
+ break;
+ }
+
+ if (!found) {
+ (void) fprintf(stderr, gettext("invalid type '%s'"
+ "for -t option\n"), field);
+ return (-1);
+ }
+
+ field = delim + 1;
+ } while (delim);
+
+ return (0);
+}
+
+
+static char *type_names[] = { "posixuser", "smbuser", "posixgroup", "smbgroup",
+ "all" };
+static unsigned type_bits[] = {
+ USTYPE_PSX_USR,
+ USTYPE_SMB_USR,
+ USTYPE_PSX_GRP,
+ USTYPE_SMB_GRP,
+ USTYPE_ALL
+};
+
+static char *us_field_names[] = { "type", "name", "used", "quota" };
+static unsigned us_field_bits[] = {
+ USFIELD_TYPE,
+ USFIELD_NAME,
+ USFIELD_USED,
+ USFIELD_QUOTA
+};
+
+static void
+print_us_node(boolean_t scripted, boolean_t parseable, unsigned fields,
+ size_t type_width, size_t name_width, size_t used_width,
+ size_t quota_width, us_node_t *node)
+{
+ nvlist_t *nvl = node->usn_nvl;
+ nvpair_t *nvp = NULL;
+ char valstr[ZFS_MAXNAMELEN];
+ boolean_t first = B_TRUE;
+ boolean_t quota_found = B_FALSE;
+
+ if (fields & USFIELD_QUOTA && !nvlist_exists(nvl, "quota"))
+ if (nvlist_add_string(nvl, "quota", "none") != 0)
+ nomem();
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ char *pname = nvpair_name(nvp);
+ data_type_t type = nvpair_type(nvp);
+ uint32_t val32 = 0;
+ uint64_t val64 = 0;
+ char *strval = NULL;
+ unsigned field = 0;
+ unsigned width = 0;
+ int i;
+ for (i = 0; i < 4; i++) {
+ if (0 == strcmp(pname, us_field_names[i])) {
+ field = us_field_bits[i];
+ break;
+ }
+ }
+
+ if (!(field & fields))
+ continue;
+
+ switch (type) {
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &val32);
+ break;
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &val64);
+ break;
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &strval);
+ break;
+ default:
+ (void) fprintf(stderr, "Invalid data type\n");
+ }
+
+ if (!first)
+ if (scripted)
+ (void) printf("\t");
+ else
+ (void) printf(" ");
+
+ switch (field) {
+ case USFIELD_TYPE:
+ strval = (char *)us_type2str(val32);
+ width = type_width;
+ break;
+ case USFIELD_NAME:
+ if (type == DATA_TYPE_UINT64) {
+ (void) sprintf(valstr, "%llu", val64);
+ strval = valstr;
+ }
+ width = name_width;
+ break;
+ case USFIELD_USED:
+ case USFIELD_QUOTA:
+ if (type == DATA_TYPE_UINT64) {
+ (void) nvpair_value_uint64(nvp, &val64);
+ if (parseable)
+ (void) sprintf(valstr, "%llu", val64);
+ else
+ zfs_nicenum(val64, valstr,
+ sizeof (valstr));
+ strval = valstr;
+ }
+
+ if (field == USFIELD_USED)
+ width = used_width;
+ else {
+ quota_found = B_FALSE;
+ width = quota_width;
+ }
+
+ break;
+ }
+
+ if (field == USFIELD_QUOTA && !quota_found)
+ (void) printf("%*s", width, strval);
+ else {
+ if (type == DATA_TYPE_STRING)
+ (void) printf("%-*s", width, strval);
+ else
+ (void) printf("%*s", width, strval);
+ }
+
+ first = B_FALSE;
+
+ }
+
+ (void) printf("\n");
+}
+
+static void
+print_us(boolean_t scripted, boolean_t parsable, unsigned fields,
+ unsigned type_width, unsigned name_width, unsigned used_width,
+ unsigned quota_width, boolean_t rmnode, uu_avl_t *avl)
+{
+ static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
+ us_node_t *node;
+ const char *col;
+ int i;
+ size_t width[4] = { type_width, name_width, used_width, quota_width };
+
+ if (!scripted) {
+ boolean_t first = B_TRUE;
+ for (i = 0; i < 4; i++) {
+ unsigned field = us_field_bits[i];
+ if (!(field & fields))
+ continue;
+
+ col = gettext(us_field_hdr[i]);
+ if (field == USFIELD_TYPE || field == USFIELD_NAME)
+ (void) printf(first ? "%-*s" : " %-*s",
+ width[i], col);
+ else
+ (void) printf(first ? "%*s" : " %*s",
+ width[i], col);
+ first = B_FALSE;
+ }
+ (void) printf("\n");
+ }
+
+ for (node = uu_avl_first(avl); node != NULL;
+ node = uu_avl_next(avl, node)) {
+ print_us_node(scripted, parsable, fields, type_width,
+ name_width, used_width, quota_width, node);
+ if (rmnode)
+ nvlist_free(node->usn_nvl);
+ }
+}
+
static int
zfs_do_userspace(int argc, char **argv)
{
zfs_handle_t *zhp;
zfs_userquota_prop_t p;
+
+ uu_avl_pool_t *avl_pool;
+ uu_avl_t *avl_tree;
+ uu_avl_walk_t *walk;
+
+ char *cmd;
+ boolean_t scripted = B_FALSE;
+ boolean_t prtnum = B_FALSE;
+ boolean_t parseable = B_FALSE;
+ boolean_t sid2posix = B_FALSE;
int error;
+ int c;
+ zfs_sort_column_t *default_sortcol = NULL;
+ zfs_sort_column_t *sortcol = NULL;
+ unsigned types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+ unsigned fields = 0;
+ unsigned props = USPROP_USED | USPROP_QUOTA;
+ us_cbdata_t cb;
+ us_node_t *node;
+ boolean_t resort_avl = B_FALSE;
+
+ if (argc < 2)
+ usage(B_FALSE);
- /*
- * Try the python version. If the execv fails, we'll continue
- * and do a simplistic implementation.
- */
- (void) execv(pypath, argv-1);
+ cmd = argv[0];
+ if (0 == strcmp(cmd, "groupspace"))
+ /* toggle default group types */
+ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+ switch (c) {
+ case 'n':
+ prtnum = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 'p':
+ parseable = B_TRUE;
+ break;
+ case 'o':
+ if (parsefields(&fields, us_field_names, us_field_bits,
+ 4) != 0)
+ return (1);
+ break;
+ case 's':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_FALSE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 'S':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_TRUE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 't':
+ if (parsefields(&types, type_names, type_bits, 5))
+ return (1);
+ break;
+ case 'i':
+ sid2posix = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
- (void) fprintf(stderr, "internal error: %s not found\n"
- "falling back on built-in implementation, "
- "some features will not work\n", pypath);
- (void) fprintf(stderr, " install sysutils/py-zfs port to correct this\n");
+ /*
+ * By default the AVL tree is sorted by (type,name); a sort that
+ * involves quota requires a re-sort after all values are collected.
+ */
+ if (sortcol) {
+ zfs_sort_column_t *sc;
+ for (sc = sortcol; sc; sc = sc->sc_next) {
+ if (sc->sc_prop == ZFS_PROP_QUOTA) {
+ resort_avl = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (!fields)
+ fields = USFIELD_ALL;
if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL)
return (1);
- (void) printf("PROP TYPE NAME VALUE\n");
+ if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+ offsetof(us_node_t, usn_avlnode),
+ us_compare, UU_DEFAULT)) == NULL)
+ nomem();
+ if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+ nomem();
+
+ if (sortcol && !resort_avl)
+ cb.cb_sortcol = sortcol;
+ else {
+ (void) zfs_add_sort_column(&default_sortcol, "type", B_FALSE);
+ (void) zfs_add_sort_column(&default_sortcol, "name", B_FALSE);
+ cb.cb_sortcol = default_sortcol;
+ }
+ cb.cb_numname = prtnum;
+ cb.cb_nicenum = !parseable;
+ cb.cb_avl_pool = avl_pool;
+ cb.cb_avl = avl_tree;
+ cb.cb_sid2posix = sid2posix;
+ cb.cb_max_typelen = strlen(gettext("TYPE"));
+ cb.cb_max_namelen = strlen(gettext("NAME"));
+ cb.cb_max_usedlen = strlen(gettext("USED"));
+ cb.cb_max_quotalen = strlen(gettext("QUOTA"));
for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
- error = zfs_userspace(zhp, p, userspace_cb, &p);
+ if (!usprop_check(p, types, props))
+ continue;
+
+ cb.cb_prop = p;
+ error = zfs_userspace(zhp, p, userspace_cb, &cb);
+
if (error)
break;
}
+
+ if (resort_avl) {
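+ /*
+ * Re-sort by the caller's columns: drain the AVL tree into a
+ * temporary list, then re-insert each node using the requested
+ * comparator.
+ */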
+ us_node_t *node;
+ us_node_t *rmnode;
+ uu_list_pool_t *listpool;
+ uu_list_t *list;
+ uu_avl_index_t idx = 0;
+ uu_list_index_t idx2 = 0;
+ listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+ offsetof(us_node_t, usn_listnode), NULL,
+ UU_DEFAULT);
+ list = uu_list_create(listpool, NULL, UU_DEFAULT);
+
+ node = uu_avl_first(avl_tree);
+ uu_list_node_init(node, &node->usn_listnode, listpool);
+ while (node != NULL) {
+ rmnode = node;
+ node = uu_avl_next(avl_tree, node);
+ uu_avl_remove(avl_tree, rmnode);
+ if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) {
+ uu_list_insert(list, rmnode, idx2);
+ }
+ }
+
+ for (node = uu_list_first(list); node != NULL;
+ node = uu_list_next(list, node)) {
+ us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+ if (uu_avl_find(avl_tree, node, &sortinfo, &idx) ==
+ NULL)
+ uu_avl_insert(avl_tree, node, idx);
+ }
+
+ uu_list_destroy(list);
+ }
+
+ /* Print the tree and free each node's nvlist. */
+ print_us(scripted, parseable, fields, cb.cb_max_typelen,
+ cb.cb_max_namelen, cb.cb_max_usedlen,
+ cb.cb_max_quotalen, B_TRUE, cb.cb_avl);
+
+ if (sortcol)
+ zfs_free_sort_columns(sortcol);
+ zfs_free_sort_columns(default_sortcol);
+
+ /*
+ * Finally, clean up the AVL tree.
+ */
+ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((node = uu_avl_walk_next(walk)) != NULL) {
+ uu_avl_remove(cb.cb_avl, node);
+ free(node);
+ }
+
+ uu_avl_walk_end(walk);
+ uu_avl_destroy(avl_tree);
+ uu_avl_pool_destroy(avl_pool);
+
return (error);
}
@@ -1756,11 +2574,11 @@ zfs_do_userspace(int argc, char **argv)
* [-s property [-s property]...] [-S property [-S property]...]
* <dataset> ...
*
- * -r Recurse over all children
- * -d Limit recursion by depth.
- * -H Scripted mode; elide headers and separate columns by tabs
- * -o Control which fields to display.
- * -t Control which object types to display.
+ * -r Recurse over all children
+ * -d Limit recursion by depth.
+ * -H Scripted mode; elide headers and separate columns by tabs
+ * -o Control which fields to display.
+ * -t Control which object types to display.
* -s Specify sort columns, descending order.
* -S Specify sort columns, ascending order.
*
@@ -2157,9 +2975,9 @@ zfs_do_promote(int argc, char **argv)
/*
* zfs rollback [-rRf] <snapshot>
*
- * -r Delete any intervening snapshots before doing rollback
- * -R Delete any snapshots and their clones
- * -f ignored for backwards compatability
+ * -r Delete any intervening snapshots before doing rollback
+ * -R Delete any snapshots and their clones
+ * -f ignored for backwards compatibility
*
* Given a filesystem, rollback to a specific snapshot, discarding any changes
* since then and making it the active dataset. If more recent snapshots exist,
@@ -2420,11 +3238,8 @@ zfs_do_snapshot(int argc, char **argv)
char c;
nvlist_t *props;
- if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
- (void) fprintf(stderr, gettext("internal error: "
- "out of memory\n"));
- return (1);
- }
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
/* check options */
while ((c = getopt(argc, argv, "ro:")) != -1) {
@@ -2469,8 +3284,8 @@ usage:
}
/*
- * zfs send [-v] -R [-i|-I <@snap>] <fs@snap>
- * zfs send [-v] [-i|-I <@snap>] <fs@snap>
+ * zfs send [-vDp] -R [-i|-I <@snap>] <fs@snap>
+ * zfs send [-vDp] [-i|-I <@snap>] <fs@snap>
*
* Send a backup stream to stdout.
*/
@@ -2481,14 +3296,13 @@ zfs_do_send(int argc, char **argv)
char *toname = NULL;
char *cp;
zfs_handle_t *zhp;
- boolean_t doall = B_FALSE;
- boolean_t replicate = B_FALSE;
- boolean_t fromorigin = B_FALSE;
- boolean_t verbose = B_FALSE;
+ sendflags_t flags = { 0 };
int c, err;
+ nvlist_t *dbgnv;
+ boolean_t extraverbose = B_FALSE;
/* check options */
- while ((c = getopt(argc, argv, ":i:I:Rv")) != -1) {
+ while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -2499,13 +3313,21 @@ zfs_do_send(int argc, char **argv)
if (fromname)
usage(B_FALSE);
fromname = optarg;
- doall = B_TRUE;
+ flags.doall = B_TRUE;
break;
case 'R':
- replicate = B_TRUE;
+ flags.replicate = B_TRUE;
+ break;
+ case 'p':
+ flags.props = B_TRUE;
break;
case 'v':
- verbose = B_TRUE;
+ if (flags.verbose)
+ extraverbose = B_TRUE;
+ flags.verbose = B_TRUE;
+ break;
+ case 'D':
+ flags.dedup = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
@@ -2565,7 +3387,7 @@ zfs_do_send(int argc, char **argv)
if (strcmp(origin, fromname) == 0) {
fromname = NULL;
- fromorigin = B_TRUE;
+ flags.fromorigin = B_TRUE;
} else {
*cp = '\0';
if (cp != fromname && strcmp(argv[0], fromname)) {
@@ -2583,18 +3405,29 @@ zfs_do_send(int argc, char **argv)
}
}
- if (replicate && fromname == NULL)
- doall = B_TRUE;
+ if (flags.replicate && fromname == NULL)
+ flags.doall = B_TRUE;
+
+ err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0,
+ extraverbose ? &dbgnv : NULL);
- err = zfs_send(zhp, fromname, toname, replicate, doall, fromorigin,
- verbose, STDOUT_FILENO);
+ if (extraverbose) {
+ /*
+ * dump_nvlist prints to stdout, but that's been
+ * redirected to a file. Make it print to stderr
+ * instead.
+ */
+ (void) dup2(STDERR_FILENO, STDOUT_FILENO);
+ dump_nvlist(dbgnv, 0);
+ nvlist_free(dbgnv);
+ }
zfs_close(zhp);
return (err != 0);
}
/*
- * zfs receive [-dnvF] <fs@snap>
+ * zfs receive [-vnFu] [-d | -e] <fs@snap>
*
* Restore a backup stream from stdin.
*/
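+
+/*
+ * A sketch of the difference between -d and the new -e (dataset names
+ * are hypothetical): receiving a stream of poolA/fs/child@snap with
+ * "zfs receive -d poolB" creates poolB/fs/child@snap, while
+ * "zfs receive -e poolB" keeps only the tail of the sent name and
+ * creates poolB/child@snap.
+ */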
@@ -2602,15 +3435,18 @@ static int
zfs_do_receive(int argc, char **argv)
{
int c, err;
- recvflags_t flags;
+ recvflags_t flags = { 0 };
- bzero(&flags, sizeof (recvflags_t));
/* check options */
- while ((c = getopt(argc, argv, ":dnuvF")) != -1) {
+ while ((c = getopt(argc, argv, ":denuvF")) != -1) {
switch (c) {
case 'd':
flags.isprefix = B_TRUE;
break;
+ case 'e':
+ flags.isprefix = B_TRUE;
+ flags.istail = B_TRUE;
+ break;
case 'n':
flags.dryrun = B_TRUE;
break;
@@ -2661,13 +3497,1652 @@ zfs_do_receive(int argc, char **argv)
return (err != 0);
}
-typedef struct get_all_cbdata {
- zfs_handle_t **cb_handles;
- size_t cb_alloc;
- size_t cb_used;
- uint_t cb_types;
- boolean_t cb_verbose;
-} get_all_cbdata_t;
+/*
+ * allow/unallow stuff
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define ZFS_DELEG_PERM_CREATE "create"
+#define ZFS_DELEG_PERM_DESTROY "destroy"
+#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
+#define ZFS_DELEG_PERM_ROLLBACK "rollback"
+#define ZFS_DELEG_PERM_CLONE "clone"
+#define ZFS_DELEG_PERM_PROMOTE "promote"
+#define ZFS_DELEG_PERM_RENAME "rename"
+#define ZFS_DELEG_PERM_MOUNT "mount"
+#define ZFS_DELEG_PERM_SHARE "share"
+#define ZFS_DELEG_PERM_SEND "send"
+#define ZFS_DELEG_PERM_RECEIVE "receive"
+#define ZFS_DELEG_PERM_ALLOW "allow"
+#define ZFS_DELEG_PERM_USERPROP "userprop"
+#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */
+#define ZFS_DELEG_PERM_USERQUOTA "userquota"
+#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
+#define ZFS_DELEG_PERM_USERUSED "userused"
+#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
+
+#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+ { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+ { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+ { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+ { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+ { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+ { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+ { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+ { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+ { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+ { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+ { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+ { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+ { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+ { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+ { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+
+ { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+ { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+ { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+ { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+ { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+ { NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/* permission structure */
+typedef struct deleg_perm {
+ zfs_deleg_who_type_t dp_who_type;
+ const char *dp_name;
+ boolean_t dp_local;
+ boolean_t dp_descend;
+} deleg_perm_t;
+
+/* AVL node wrapping a single delegated permission */
+typedef struct deleg_perm_node {
+ deleg_perm_t dpn_perm;
+
+ uu_avl_node_t dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/* permissions set */
+typedef struct who_perm {
+ zfs_deleg_who_type_t who_type;
+ const char *who_name; /* id */
+ char who_ug_name[256]; /* user/group name */
+ fs_perm_t *who_fsperm; /* uplink */
+
+ uu_avl_t *who_deleg_perm_avl; /* permissions */
+} who_perm_t;
+
+/* AVL node wrapping one who entry and its delegated permissions */
+typedef struct who_perm_node {
+ who_perm_t who_perm;
+ uu_avl_node_t who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/* fs permissions */
+struct fs_perm {
+ const char *fsp_name;
+
+ uu_avl_t *fsp_sc_avl; /* sets,create */
+ uu_avl_t *fsp_uge_avl; /* user,group,everyone */
+
+ fs_perm_set_t *fsp_set; /* uplink */
+};
+
+/* list node wrapping the permissions of one file system */
+typedef struct fs_perm_node {
+ fs_perm_t fspn_fsperm;
+ uu_avl_t *fspn_avl;
+
+ uu_list_node_t fspn_list_node;
+} fs_perm_node_t;
+
+/* top level structure */
+struct fs_perm_set {
+ uu_list_pool_t *fsps_list_pool;
+ uu_list_t *fsps_list; /* list of fs_perms */
+
+ uu_avl_pool_t *fsps_named_set_avl_pool;
+ uu_avl_pool_t *fsps_who_perm_avl_pool;
+ uu_avl_pool_t *fsps_deleg_perm_avl_pool;
+};
+
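+/*
+ * How the structures above nest (an orientation note, not part of the
+ * original change):
+ *
+ *	fs_perm_set
+ *	    fsps_list: fs_perm_node, one per file system
+ *	        fsp_sc_avl / fsp_uge_avl: who_perm_node, one per who entry
+ *	            who_deleg_perm_avl: deleg_perm_node, one per permission
+ */
+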
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+	/*
+	 * The quota/used/userprop notes below are reported as "other";
+	 * every remaining delegation corresponds to a zfs subcommand.
+	 */
+	switch (note) {
+ case ZFS_DELEG_NOTE_GROUPQUOTA:
+ case ZFS_DELEG_NOTE_GROUPUSED:
+ case ZFS_DELEG_NOTE_USERPROP:
+ case ZFS_DELEG_NOTE_USERQUOTA:
+ case ZFS_DELEG_NOTE_USERUSED:
+		return (gettext("other"));
+ default:
+ return (gettext("subcommand"));
+ }
+}
+
+static inline int
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+ int res;
+ switch (who_type) {
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ res = 0;
+ break;
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_CREATE:
+ res = 1;
+ break;
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ res = 2;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ res = 3;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ res = 4;
+ break;
+ default:
+ res = -1;
+ }
+
+ return (res);
+}
+
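+/*
+ * Note: the weights above only determine display order, i.e. named
+ * sets sort first, then create-time, user, group, and everyone
+ * entries; see who_perm_compare() below.
+ */
+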
+/* ARGSUSED */
+static int
+who_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+ const who_perm_node_t *l = larg;
+ const who_perm_node_t *r = rarg;
+ zfs_deleg_who_type_t ltype = l->who_perm.who_type;
+ zfs_deleg_who_type_t rtype = r->who_perm.who_type;
+ int lweight = who_type2weight(ltype);
+ int rweight = who_type2weight(rtype);
+ int res = lweight - rweight;
+ if (res == 0)
+ res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
+ ZFS_MAX_DELEG_NAME-1);
+
+ if (res == 0)
+ return (0);
+ if (res > 0)
+ return (1);
+ else
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+ const deleg_perm_node_t *l = larg;
+ const deleg_perm_node_t *r = rarg;
+ int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
+ ZFS_MAX_DELEG_NAME-1);
+
+ if (res == 0)
+ return (0);
+
+ if (res > 0)
+ return (1);
+ else
+ return (-1);
+}
+
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
+{
+ bzero(fspset, sizeof (fs_perm_set_t));
+
+ if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+ sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+ NULL, UU_DEFAULT)) == NULL)
+ nomem();
+ if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+ "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
+ who_perm_node_t, who_avl_node), who_perm_compare,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+ "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
+ who_perm_node_t, who_avl_node), who_perm_compare,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+ "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
+ deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
+ == NULL)
+ nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+ fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+ while (node != NULL) {
+ fs_perm_node_t *next_node =
+ uu_list_next(fspset->fsps_list, node);
+ fs_perm_t *fsperm = &node->fspn_fsperm;
+ fs_perm_fini(fsperm);
+ uu_list_remove(fspset->fsps_list, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+ uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+ uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+ const char *name)
+{
+ deleg_perm->dp_who_type = type;
+ deleg_perm->dp_name = name;
+}
+
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+ zfs_deleg_who_type_t type, const char *name)
+{
+ uu_avl_pool_t *pool;
+ pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+ bzero(who_perm, sizeof (who_perm_t));
+
+ if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ who_perm->who_type = type;
+ who_perm->who_name = name;
+ who_perm->who_fsperm = fsperm;
+}
+
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+ deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
+
+ while (node != NULL) {
+ deleg_perm_node_t *next_node =
+ uu_avl_next(who_perm->who_deleg_perm_avl, node);
+
+ uu_avl_remove(who_perm->who_deleg_perm_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_destroy(who_perm->who_deleg_perm_avl);
+}
+
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+ uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool;
+ uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool;
+
+ bzero(fsperm, sizeof (fs_perm_t));
+
+ if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
+ == NULL)
+ nomem();
+
+ if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
+ == NULL)
+ nomem();
+
+ fsperm->fsp_set = fspset;
+ fsperm->fsp_name = fsname;
+}
+
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+ who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
+ while (node != NULL) {
+ who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
+ node);
+ who_perm_t *who_perm = &node->who_perm;
+ who_perm_fini(who_perm);
+ uu_avl_remove(fsperm->fsp_sc_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ node = uu_avl_first(fsperm->fsp_uge_avl);
+ while (node != NULL) {
+ who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
+ node);
+ who_perm_t *who_perm = &node->who_perm;
+ who_perm_fini(who_perm);
+ uu_avl_remove(fsperm->fsp_uge_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_destroy(fsperm->fsp_sc_avl);
+ uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+static inline void
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+ zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+ uu_avl_index_t idx = 0;
+
+ deleg_perm_node_t *found_node = NULL;
+ deleg_perm_t *deleg_perm = &node->dpn_perm;
+
+ deleg_perm_init(deleg_perm, who_type, name);
+
+ if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+ == NULL)
+ uu_avl_insert(avl, node, idx);
+	else {
+		/* duplicate entry: free ours and reuse the existing node */
+		free(node);
+		node = found_node;
+		deleg_perm = &node->dpn_perm;
+	}
+
+
+ switch (locality) {
+ case ZFS_DELEG_LOCAL:
+ deleg_perm->dp_local = B_TRUE;
+ break;
+ case ZFS_DELEG_DESCENDENT:
+ deleg_perm->dp_descend = B_TRUE;
+ break;
+ case ZFS_DELEG_NA:
+ break;
+ default:
+ assert(B_FALSE); /* invalid locality */
+ }
+}
+
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+ nvpair_t *nvp = NULL;
+ fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+ uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ const char *name = nvpair_name(nvp);
+ data_type_t type = nvpair_type(nvp);
+ uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+ deleg_perm_node_t *node =
+ safe_malloc(sizeof (deleg_perm_node_t));
+
+ assert(type == DATA_TYPE_BOOLEAN);
+
+ uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+ set_deleg_perm_node(avl, node, who_type, name, locality);
+ }
+
+ return (0);
+}
+
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+ nvpair_t *nvp = NULL;
+ fs_perm_set_t *fspset = fsperm->fsp_set;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ nvlist_t *nvl2 = NULL;
+ const char *name = nvpair_name(nvp);
+ uu_avl_t *avl = NULL;
+ uu_avl_pool_t *avl_pool;
+ zfs_deleg_who_type_t perm_type = name[0];
+ char perm_locality = name[1];
+ const char *perm_name = name + 3;
+ boolean_t is_set = B_TRUE;
+ who_perm_t *who_perm = NULL;
+
+ assert('$' == name[2]);
+
+ if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+ return (-1);
+
+ switch (perm_type) {
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ avl_pool = fspset->fsps_named_set_avl_pool;
+ avl = fsperm->fsp_sc_avl;
+ break;
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ avl_pool = fspset->fsps_who_perm_avl_pool;
+ avl = fsperm->fsp_uge_avl;
+ break;
+ }
+
+ if (is_set) {
+ who_perm_node_t *found_node = NULL;
+ who_perm_node_t *node = safe_malloc(
+ sizeof (who_perm_node_t));
+ who_perm = &node->who_perm;
+ uu_avl_index_t idx = 0;
+
+ uu_avl_node_init(node, &node->who_avl_node, avl_pool);
+ who_perm_init(who_perm, fsperm, perm_type, perm_name);
+
+ if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+ == NULL) {
+ if (avl == fsperm->fsp_uge_avl) {
+ uid_t rid = 0;
+ struct passwd *p = NULL;
+ struct group *g = NULL;
+ const char *nice_name = NULL;
+
+ switch (perm_type) {
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ rid = atoi(perm_name);
+ p = getpwuid(rid);
+ if (p)
+ nice_name = p->pw_name;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ rid = atoi(perm_name);
+ g = getgrgid(rid);
+ if (g)
+ nice_name = g->gr_name;
+ break;
+ }
+
+ if (nice_name != NULL)
+ (void) strlcpy(
+ node->who_perm.who_ug_name,
+ nice_name, 256);
+ }
+
+ uu_avl_insert(avl, node, idx);
+ } else {
+ node = found_node;
+ who_perm = &node->who_perm;
+ }
+ }
+
+ (void) parse_who_perm(who_perm, nvl2, perm_locality);
+ }
+
+ return (0);
+}
+
+static inline int
+parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
+{
+ nvpair_t *nvp = NULL;
+ uu_avl_index_t idx = 0;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ nvlist_t *nvl2 = NULL;
+ const char *fsname = nvpair_name(nvp);
+ data_type_t type = nvpair_type(nvp);
+ fs_perm_t *fsperm = NULL;
+ fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
+
+ fsperm = &node->fspn_fsperm;
+
+ assert(DATA_TYPE_NVLIST == type);
+
+ uu_list_node_init(node, &node->fspn_list_node,
+ fspset->fsps_list_pool);
+
+ idx = uu_list_numnodes(fspset->fsps_list);
+ fs_perm_init(fsperm, fspset, fsname);
+
+ if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+ return (-1);
+
+ (void) parse_fs_perm(fsperm, nvl2);
+
+ uu_list_insert(fspset->fsps_list, node, idx);
+ }
+
+ return (0);
+}
+
+static inline const char *
+deleg_perm_comment(zfs_deleg_note_t note)
+{
+ const char *str = "";
+
+	switch (note) {
+	/* subcommands */
+ case ZFS_DELEG_NOTE_ALLOW:
+ str = gettext("Must also have the permission that is being"
+ "\n\t\t\t\tallowed");
+ break;
+ case ZFS_DELEG_NOTE_CLONE:
+ str = gettext("Must also have the 'create' ability and 'mount'"
+ "\n\t\t\t\tability in the origin file system");
+ break;
+ case ZFS_DELEG_NOTE_CREATE:
+ str = gettext("Must also have the 'mount' ability");
+ break;
+ case ZFS_DELEG_NOTE_DESTROY:
+ str = gettext("Must also have the 'mount' ability");
+ break;
+ case ZFS_DELEG_NOTE_DIFF:
+		str = gettext("Allows lookup of paths within a dataset,"
+ "\n\t\t\t\tgiven an object number. Ordinary users need this"
+ "\n\t\t\t\tin order to use zfs diff");
+ break;
+ case ZFS_DELEG_NOTE_HOLD:
+ str = gettext("Allows adding a user hold to a snapshot");
+ break;
+ case ZFS_DELEG_NOTE_MOUNT:
+ str = gettext("Allows mount/umount of ZFS datasets");
+ break;
+ case ZFS_DELEG_NOTE_PROMOTE:
+ str = gettext("Must also have the 'mount'\n\t\t\t\tand"
+ " 'promote' ability in the origin file system");
+ break;
+ case ZFS_DELEG_NOTE_RECEIVE:
+ str = gettext("Must also have the 'mount' and 'create'"
+ " ability");
+ break;
+ case ZFS_DELEG_NOTE_RELEASE:
+ str = gettext("Allows releasing a user hold which\n\t\t\t\t"
+ "might destroy the snapshot");
+ break;
+ case ZFS_DELEG_NOTE_RENAME:
+ str = gettext("Must also have the 'mount' and 'create'"
+ "\n\t\t\t\tability in the new parent");
+ break;
+ case ZFS_DELEG_NOTE_ROLLBACK:
+ str = gettext("");
+ break;
+ case ZFS_DELEG_NOTE_SEND:
+ str = gettext("");
+ break;
+ case ZFS_DELEG_NOTE_SHARE:
+ str = gettext("Allows sharing file systems over NFS or SMB"
+ "\n\t\t\t\tprotocols");
+ break;
+ case ZFS_DELEG_NOTE_SNAPSHOT:
+ str = gettext("");
+ break;
+/*
+ * case ZFS_DELEG_NOTE_VSCAN:
+ * str = gettext("");
+ * break;
+ */
+ /* OTHER */
+ case ZFS_DELEG_NOTE_GROUPQUOTA:
+ str = gettext("Allows accessing any groupquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_GROUPUSED:
+ str = gettext("Allows reading any groupused@... property");
+ break;
+ case ZFS_DELEG_NOTE_USERPROP:
+ str = gettext("Allows changing any user property");
+ break;
+ case ZFS_DELEG_NOTE_USERQUOTA:
+ str = gettext("Allows accessing any userquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_USERUSED:
+ str = gettext("Allows reading any userused@... property");
+ break;
+	default:
+		str = "";
+ }
+
+ return (str);
+}
+
+struct allow_opts {
+ boolean_t local;
+ boolean_t descend;
+ boolean_t user;
+ boolean_t group;
+ boolean_t everyone;
+ boolean_t create;
+ boolean_t set;
+ boolean_t recursive; /* unallow only */
+ boolean_t prt_usage;
+
+ boolean_t prt_perms;
+ char *who;
+ char *perms;
+ const char *dataset;
+};
+
+static inline int
+prop_cmp(const void *a, const void *b)
+{
+ const char *str1 = *(const char **)a;
+ const char *str2 = *(const char **)b;
+ return (strcmp(str1, str2));
+}
+
+static void
+allow_usage(boolean_t un, boolean_t requested, const char *msg)
+{
+ const char *opt_desc[] = {
+ "-h", gettext("show this help message and exit"),
+ "-l", gettext("set permission locally"),
+		"-d", gettext("set permission for descendents"),
+ "-u", gettext("set permission for user"),
+ "-g", gettext("set permission for group"),
+ "-e", gettext("set permission for everyone"),
+ "-c", gettext("set create time permission"),
+ "-s", gettext("define permission set"),
+ /* unallow only */
+ "-r", gettext("remove permissions recursively"),
+ };
+ size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
+ size_t allow_size = unallow_size - 2;
+ const char *props[ZFS_NUM_PROPS];
+ int i;
+ size_t count = 0;
+ FILE *fp = requested ? stdout : stderr;
+ zprop_desc_t *pdtbl = zfs_prop_get_table();
+ const char *fmt = gettext("%-16s %-14s\t%s\n");
+
+ (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
+ HELP_ALLOW));
+ (void) fprintf(fp, gettext("Options:\n"));
+ for (i = 0; i < (un ? unallow_size : allow_size); i++) {
+ const char *opt = opt_desc[i++];
+ const char *optdsc = opt_desc[i];
+ (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc);
+ }
+
+ (void) fprintf(fp, gettext("\nThe following permissions are "
+ "supported:\n\n"));
+ (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
+ gettext("NOTES"));
+ for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
+ const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
+ zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
+ const char *perm_type = deleg_perm_type(perm_note);
+ const char *perm_comment = deleg_perm_comment(perm_note);
+ (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
+ }
+
+ for (i = 0; i < ZFS_NUM_PROPS; i++) {
+ zprop_desc_t *pd = &pdtbl[i];
+ if (pd->pd_visible != B_TRUE)
+ continue;
+
+ if (pd->pd_attr == PROP_READONLY)
+ continue;
+
+ props[count++] = pd->pd_name;
+ }
+ props[count] = NULL;
+
+ qsort(props, count, sizeof (char *), prop_cmp);
+
+ for (i = 0; i < count; i++)
+ (void) fprintf(fp, fmt, props[i], gettext("property"), "");
+
+ if (msg != NULL)
+ (void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
+
+ exit(requested ? 0 : 2);
+}
+
+static inline const char *
+munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
+ char **permsp)
+{
+ if (un && argc == expected_argc - 1)
+ *permsp = NULL;
+ else if (argc == expected_argc)
+ *permsp = argv[argc - 2];
+ else
+ allow_usage(un, B_FALSE,
+ gettext("wrong number of parameters\n"));
+
+ return (argv[argc - 1]);
+}
+
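+/*
+ * Shapes accepted by munge_args(), with invented arguments: for
+ * "alice create,mount tank" and expected_argc 3, perms becomes
+ * "create,mount" and the dataset is "tank"; for unallow one argument
+ * may be omitted ("alice tank"), leaving perms NULL to mean "remove
+ * every permission delegated to alice".
+ */
+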
+static void
+parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
+{
+ int uge_sum = opts->user + opts->group + opts->everyone;
+ int csuge_sum = opts->create + opts->set + uge_sum;
+ int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
+ int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
+
+ if (uge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("-u, -g, and -e are mutually exclusive\n"));
+
+	if (opts->prt_usage) {
+		if (argc == 0 && all_sum == 0)
+			allow_usage(un, B_TRUE, NULL);
+		else
+			usage(B_FALSE);
+	}
+
+ if (opts->set) {
+ if (csuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -s\n"));
+
+ opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+ if (argv[0][0] != '@')
+ allow_usage(un, B_FALSE,
+ gettext("invalid set name: missing '@' prefix\n"));
+ opts->who = argv[0];
+ } else if (opts->create) {
+ if (ldcsuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -c\n"));
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (opts->everyone) {
+ if (csuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -e\n"));
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
+ == 0) {
+ opts->everyone = B_TRUE;
+ argc--;
+ argv++;
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (argc == 1) {
+ opts->prt_perms = B_TRUE;
+ opts->dataset = argv[argc-1];
+ } else {
+ opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+ opts->who = argv[0];
+ }
+
+ if (!opts->local && !opts->descend) {
+ opts->local = B_TRUE;
+ opts->descend = B_TRUE;
+ }
+}
+
+static void
+store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
+ const char *who, char *perms, nvlist_t *top_nvl)
+{
+ int i;
+ char ld[2] = { '\0', '\0' };
+ char who_buf[ZFS_MAXNAMELEN+32];
+ char base_type;
+ char set_type;
+ nvlist_t *base_nvl = NULL;
+ nvlist_t *set_nvl = NULL;
+ nvlist_t *nvl;
+
+ if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+ if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ switch (type) {
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ set_type = ZFS_DELEG_NAMED_SET_SETS;
+ base_type = ZFS_DELEG_NAMED_SET;
+ ld[0] = ZFS_DELEG_NA;
+ break;
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_CREATE:
+ set_type = ZFS_DELEG_CREATE_SETS;
+ base_type = ZFS_DELEG_CREATE;
+ ld[0] = ZFS_DELEG_NA;
+ break;
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ set_type = ZFS_DELEG_USER_SETS;
+ base_type = ZFS_DELEG_USER;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ set_type = ZFS_DELEG_GROUP_SETS;
+ base_type = ZFS_DELEG_GROUP;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ set_type = ZFS_DELEG_EVERYONE_SETS;
+ base_type = ZFS_DELEG_EVERYONE;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ }
+
+ if (perms != NULL) {
+ char *curr = perms;
+ char *end = curr + strlen(perms);
+
+ while (curr < end) {
+ char *delim = strchr(curr, ',');
+ if (delim == NULL)
+ delim = end;
+ else
+ *delim = '\0';
+
+ if (curr[0] == '@')
+ nvl = set_nvl;
+ else
+ nvl = base_nvl;
+
+ (void) nvlist_add_boolean(nvl, curr);
+ if (delim != end)
+ *delim = ',';
+ curr = delim + 1;
+ }
+
+ for (i = 0; i < 2; i++) {
+ char locality = ld[i];
+ if (locality == 0)
+ continue;
+
+ if (!nvlist_empty(base_nvl)) {
+ if (who != NULL)
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$%s",
+ base_type, locality, who);
+ else
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$",
+ base_type, locality);
+
+ (void) nvlist_add_nvlist(top_nvl, who_buf,
+ base_nvl);
+ }
+
+
+ if (!nvlist_empty(set_nvl)) {
+ if (who != NULL)
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$%s",
+ set_type, locality, who);
+ else
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$",
+ set_type, locality);
+
+ (void) nvlist_add_nvlist(top_nvl, who_buf,
+ set_nvl);
+ }
+ }
+ } else {
+ for (i = 0; i < 2; i++) {
+ char locality = ld[i];
+ if (locality == 0)
+ continue;
+
+ if (who != NULL)
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$%s", base_type, locality, who);
+ else
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$", base_type, locality);
+ (void) nvlist_add_boolean(top_nvl, who_buf);
+
+ if (who != NULL)
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$%s", set_type, locality, who);
+ else
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$", set_type, locality);
+ (void) nvlist_add_boolean(top_nvl, who_buf);
+ }
+ }
+}
+
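+/*
+ * Key format note (inferred from the snprintf calls above and the
+ * parsing in parse_fs_perm): every entry added to top_nvl is named
+ * "<type char><locality char>$<who>", where the who suffix is empty
+ * for create-time and everyone entries.
+ */
+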
+static int
+construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
+{
+ if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ if (opts->set) {
+ store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
+ opts->descend, opts->who, opts->perms, *nvlp);
+ } else if (opts->create) {
+ store_allow_perm(ZFS_DELEG_CREATE, opts->local,
+ opts->descend, NULL, opts->perms, *nvlp);
+ } else if (opts->everyone) {
+ store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
+ opts->descend, NULL, opts->perms, *nvlp);
+ } else {
+ char *curr = opts->who;
+ char *end = curr + strlen(curr);
+
+ while (curr < end) {
+ const char *who;
+ zfs_deleg_who_type_t who_type;
+ char *endch;
+ char *delim = strchr(curr, ',');
+ char errbuf[256];
+ char id[64];
+ struct passwd *p = NULL;
+ struct group *g = NULL;
+
+ uid_t rid;
+ if (delim == NULL)
+ delim = end;
+ else
+ *delim = '\0';
+
+ rid = (uid_t)strtol(curr, &endch, 0);
+ if (opts->user) {
+ who_type = ZFS_DELEG_USER;
+ if (*endch != '\0')
+ p = getpwnam(curr);
+ else
+ p = getpwuid(rid);
+
+ if (p != NULL)
+ rid = p->pw_uid;
+ else {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user %s"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ } else if (opts->group) {
+ who_type = ZFS_DELEG_GROUP;
+ if (*endch != '\0')
+ g = getgrnam(curr);
+ else
+ g = getgrgid(rid);
+
+ if (g != NULL)
+ rid = g->gr_gid;
+ else {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid group %s"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ } else {
+ if (*endch != '\0') {
+ p = getpwnam(curr);
+ } else {
+ p = getpwuid(rid);
+ }
+
+				if (p == NULL) {
+					if (*endch != '\0')
+						g = getgrnam(curr);
+					else
+						g = getgrgid(rid);
+				}
+
+ if (p != NULL) {
+ who_type = ZFS_DELEG_USER;
+ rid = p->pw_uid;
+ } else if (g != NULL) {
+ who_type = ZFS_DELEG_GROUP;
+ rid = g->gr_gid;
+ } else {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user/group %s"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ }
+
+ (void) sprintf(id, "%u", rid);
+ who = id;
+
+ store_allow_perm(who_type, opts->local,
+ opts->descend, who, opts->perms, *nvlp);
+ curr = delim + 1;
+ }
+ }
+
+ return (0);
+}
+
+static void
+print_set_creat_perms(uu_avl_t *who_avl)
+{
+ const char *sc_title[] = {
+ gettext("Permission sets:\n"),
+ gettext("Create time permissions:\n"),
+ NULL
+ };
+ const char **title_ptr = sc_title;
+ who_perm_node_t *who_node = NULL;
+ int prev_weight = -1;
+
+ for (who_node = uu_avl_first(who_avl); who_node != NULL;
+ who_node = uu_avl_next(who_avl, who_node)) {
+ uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+ const char *who_name = who_node->who_perm.who_name;
+ int weight = who_type2weight(who_type);
+ boolean_t first = B_TRUE;
+ deleg_perm_node_t *deleg_node;
+
+ if (prev_weight != weight) {
+			(void) printf("%s", *title_ptr++);
+ prev_weight = weight;
+ }
+
+ if (who_name == NULL || strnlen(who_name, 1) == 0)
+ (void) printf("\t");
+ else
+ (void) printf("\t%s ", who_name);
+
+ for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
+ deleg_node = uu_avl_next(avl, deleg_node)) {
+ if (first) {
+ (void) printf("%s",
+ deleg_node->dpn_perm.dp_name);
+ first = B_FALSE;
+ } else
+ (void) printf(",%s",
+ deleg_node->dpn_perm.dp_name);
+ }
+
+ (void) printf("\n");
+ }
+}
+
+static inline void
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
+ const char *title)
+{
+ who_perm_node_t *who_node = NULL;
+ boolean_t prt_title = B_TRUE;
+ uu_avl_walk_t *walk;
+
+ if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((who_node = uu_avl_walk_next(walk)) != NULL) {
+ const char *who_name = who_node->who_perm.who_name;
+ const char *nice_who_name = who_node->who_perm.who_ug_name;
+ uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+ char delim = ' ';
+ deleg_perm_node_t *deleg_node;
+ boolean_t prt_who = B_TRUE;
+
+ for (deleg_node = uu_avl_first(avl);
+ deleg_node != NULL;
+ deleg_node = uu_avl_next(avl, deleg_node)) {
+ if (local != deleg_node->dpn_perm.dp_local ||
+ descend != deleg_node->dpn_perm.dp_descend)
+ continue;
+
+ if (prt_who) {
+ const char *who = NULL;
+ if (prt_title) {
+ prt_title = B_FALSE;
+					(void) printf("%s", title);
+ }
+
+ switch (who_type) {
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ who = gettext("user");
+ if (nice_who_name)
+ who_name = nice_who_name;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ who = gettext("group");
+ if (nice_who_name)
+ who_name = nice_who_name;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ who = gettext("everyone");
+ who_name = NULL;
+ }
+
+ prt_who = B_FALSE;
+ if (who_name == NULL)
+ (void) printf("\t%s", who);
+ else
+ (void) printf("\t%s %s", who, who_name);
+ }
+
+ (void) printf("%c%s", delim,
+ deleg_node->dpn_perm.dp_name);
+ delim = ',';
+ }
+
+ if (!prt_who)
+ (void) printf("\n");
+ }
+
+ uu_avl_walk_end(walk);
+}
+
+static void
+print_fs_perms(fs_perm_set_t *fspset)
+{
+ fs_perm_node_t *node = NULL;
+ char buf[ZFS_MAXNAMELEN+32];
+ const char *dsname = buf;
+
+ for (node = uu_list_first(fspset->fsps_list); node != NULL;
+ node = uu_list_next(fspset->fsps_list, node)) {
+ uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
+ uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
+ int left = 0;
+
+ (void) snprintf(buf, ZFS_MAXNAMELEN+32,
+ gettext("---- Permissions on %s "),
+ node->fspn_fsperm.fsp_name);
+		(void) printf("%s", dsname);
+ left = 70 - strlen(buf);
+ while (left-- > 0)
+ (void) printf("-");
+ (void) printf("\n");
+
+ print_set_creat_perms(sc_avl);
+ print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
+ gettext("Local permissions:\n"));
+ print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
+ gettext("Descendent permissions:\n"));
+ print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
+ gettext("Local+Descendent permissions:\n"));
+ }
+}
+
+static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
+
+struct deleg_perms {
+ boolean_t un;
+ nvlist_t *nvl;
+};
+
+static int
+set_deleg_perms(zfs_handle_t *zhp, void *data)
+{
+ struct deleg_perms *perms = (struct deleg_perms *)data;
+ zfs_type_t zfs_type = zfs_get_type(zhp);
+
+ if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
+ return (0);
+
+ return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
+}
+
+static int
+zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
+{
+ zfs_handle_t *zhp;
+ nvlist_t *perm_nvl = NULL;
+ nvlist_t *update_perm_nvl = NULL;
+ int error = 1;
+ int c;
+ struct allow_opts opts = { 0 };
+
+ const char *optstr = un ? "ldugecsrh" : "ldugecsh";
+
+ /* check opts */
+ while ((c = getopt(argc, argv, optstr)) != -1) {
+ switch (c) {
+ case 'l':
+ opts.local = B_TRUE;
+ break;
+ case 'd':
+ opts.descend = B_TRUE;
+ break;
+ case 'u':
+ opts.user = B_TRUE;
+ break;
+ case 'g':
+ opts.group = B_TRUE;
+ break;
+ case 'e':
+ opts.everyone = B_TRUE;
+ break;
+ case 's':
+ opts.set = B_TRUE;
+ break;
+ case 'c':
+ opts.create = B_TRUE;
+ break;
+ case 'r':
+ opts.recursive = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case 'h':
+ opts.prt_usage = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check arguments */
+ parse_allow_args(argc, argv, un, &opts);
+
+ /* try to open the dataset */
+ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM))
+ == NULL) {
+		(void) fprintf(stderr, gettext("failed to open dataset "
+		    "'%s'\n"), opts.dataset);
+ return (-1);
+ }
+
+ if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
+ goto cleanup2;
+
+ fs_perm_set_init(&fs_perm_set);
+ if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to parse fsacl permissions\n"));
+ goto cleanup1;
+ }
+
+ if (opts.prt_perms)
+ print_fs_perms(&fs_perm_set);
+ else {
+ (void) construct_fsacl_list(un, &opts, &update_perm_nvl);
+ if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
+ goto cleanup0;
+
+ if (un && opts.recursive) {
+ struct deleg_perms data = { un, update_perm_nvl };
+ if (zfs_iter_filesystems(zhp, set_deleg_perms,
+ &data) != 0)
+ goto cleanup0;
+ }
+ }
+
+ error = 0;
+
+cleanup0:
+ nvlist_free(perm_nvl);
+ if (update_perm_nvl != NULL)
+ nvlist_free(update_perm_nvl);
+cleanup1:
+ fs_perm_set_fini(&fs_perm_set);
+cleanup2:
+ zfs_close(zhp);
+
+ return (error);
+}
+
+/*
+ * zfs allow [-ldug] <"everyone"|user|group>[,...]
+ *	<perm|@setname>[,...] <filesystem|volume>
+ * zfs allow [-ld] -e <perm|@setname>[,...] <filesystem|volume>
+ * zfs allow -c <perm|@setname>[,...] <filesystem|volume>
+ * zfs allow -s @setname <perm|@setname>[,...] <filesystem|volume>
+ *
+ * Delegate permissions on the given dataset; with no who/perm
+ * arguments, print the delegations currently in effect.
+ */
+static int
+zfs_do_allow(int argc, char **argv)
+{
+ return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
+}
+
+/*
+ * zfs unallow [-rldug] <"everyone"|user|group>[,...]
+ *	[<perm|@setname>[,...]] <filesystem|volume>
+ * zfs unallow [-rld] -e [<perm|@setname>[,...]] <filesystem|volume>
+ * zfs unallow [-r] -c [<perm|@setname>[,...]] <filesystem|volume>
+ * zfs unallow [-r] -s @setname [<perm|@setname>[,...]] <filesystem|volume>
+ *
+ * Remove delegated permissions; if no permissions are listed, remove
+ * all of them for the matching who entries.
+ */
+static int
+zfs_do_unallow(int argc, char **argv)
+{
+ return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
+}
+
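+/*
+ * Example delegations (user, set, and dataset names are invented):
+ *
+ *	zfs allow -u alice create,mount,snapshot tank/home
+ *	zfs allow -s @backup send,snapshot tank
+ *	zfs unallow -r -u alice tank/home
+ */
+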
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+ int errors = 0;
+ int i;
+ const char *tag;
+ boolean_t recursive = B_FALSE;
+ boolean_t temphold = B_FALSE;
+ const char *opts = holding ? "rt" : "r";
+ int c;
+
+ /* check options */
+ while ((c = getopt(argc, argv, opts)) != -1) {
+ switch (c) {
+ case 'r':
+ recursive = B_TRUE;
+ break;
+ case 't':
+ temphold = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 2)
+ usage(B_FALSE);
+
+ tag = argv[0];
+ --argc;
+ ++argv;
+
+ if (holding && tag[0] == '.') {
+ /* tags starting with '.' are reserved for libzfs */
+ (void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+ usage(B_FALSE);
+ }
+
+ for (i = 0; i < argc; ++i) {
+ zfs_handle_t *zhp;
+ char parent[ZFS_MAXNAMELEN];
+ const char *delim;
+ char *path = argv[i];
+
+ delim = strchr(path, '@');
+ if (delim == NULL) {
+ (void) fprintf(stderr,
+ gettext("'%s' is not a snapshot\n"), path);
+ ++errors;
+ continue;
+ }
+ (void) strncpy(parent, path, delim - path);
+ parent[delim - path] = '\0';
+
+ zhp = zfs_open(g_zfs, parent,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL) {
+ ++errors;
+ continue;
+ }
+ if (holding) {
+ if (zfs_hold(zhp, delim+1, tag, recursive,
+ temphold, B_FALSE, -1, 0, 0) != 0)
+ ++errors;
+ } else {
+ if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+ ++errors;
+ }
+ zfs_close(zhp);
+ }
+
+ return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] [-t] <tag> <snap> ...
+ *
+ * -r Recursively hold
+ * -t Temporary hold (hidden option)
+ *
+ * Apply a user-hold with the given tag to the list of snapshots.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ * -r Recursively release
+ *
+ * Release a user-hold with the given tag from the list of snapshots.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
+}
+
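+/*
+ * Example hold and release pair (tag and snapshot names are invented):
+ *
+ *	zfs hold -r backup-2011 tank/home@tuesday
+ *	zfs release -r backup-2011 tank/home@tuesday
+ */
+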
+typedef struct holds_cbdata {
+ boolean_t cb_recursive;
+ const char *cb_snapname;
+ nvlist_t **cb_nvlp;
+ size_t cb_max_namelen;
+ size_t cb_max_taglen;
+} holds_cbdata_t;
+
+#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
+#define DATETIME_BUF_LEN (32)
+/*
+ * Print a tabular list of the user holds on each snapshot in nvl.
+ */
+static void
+print_holds(boolean_t scripted, size_t nwidth, size_t tagwidth, nvlist_t *nvl)
+{
+ int i;
+ nvpair_t *nvp = NULL;
+ char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
+ const char *col;
+
+ if (!scripted) {
+ for (i = 0; i < 3; i++) {
+ col = gettext(hdr_cols[i]);
+ if (i < 2)
+ (void) printf("%-*s ", i ? tagwidth : nwidth,
+ col);
+ else
+ (void) printf("%s\n", col);
+ }
+ }
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ char *zname = nvpair_name(nvp);
+ nvlist_t *nvl2;
+ nvpair_t *nvp2 = NULL;
+ (void) nvpair_value_nvlist(nvp, &nvl2);
+ while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
+ char tsbuf[DATETIME_BUF_LEN];
+ char *tagname = nvpair_name(nvp2);
+ uint64_t val = 0;
+ time_t time;
+ struct tm t;
+ char sep = scripted ? '\t' : ' ';
+ size_t sepnum = scripted ? 1 : 2;
+
+ (void) nvpair_value_uint64(nvp2, &val);
+ time = (time_t)val;
+ (void) localtime_r(&time, &t);
+ (void) strftime(tsbuf, DATETIME_BUF_LEN,
+ gettext(STRFTIME_FMT_STR), &t);
+
+ (void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname,
+ sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf);
+ }
+ }
+}
+
+/*
+ * zfs_for_each() callback: collect the holds on one snapshot and
+ * track the column widths needed to print them later.
+ */
+static int
+holds_callback(zfs_handle_t *zhp, void *data)
+{
+ holds_cbdata_t *cbp = data;
+ nvlist_t *top_nvl = *cbp->cb_nvlp;
+ nvlist_t *nvl = NULL;
+ nvpair_t *nvp = NULL;
+ const char *zname = zfs_get_name(zhp);
+ size_t znamelen = strnlen(zname, ZFS_MAXNAMELEN);
+
+ if (cbp->cb_recursive) {
+ const char *snapname;
+ char *delim = strchr(zname, '@');
+ if (delim == NULL)
+ return (0);
+
+ snapname = delim + 1;
+ if (strcmp(cbp->cb_snapname, snapname))
+ return (0);
+ }
+
+ if (zfs_get_holds(zhp, &nvl) != 0)
+ return (-1);
+
+ if (znamelen > cbp->cb_max_namelen)
+ cbp->cb_max_namelen = znamelen;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ const char *tag = nvpair_name(nvp);
+ size_t taglen = strnlen(tag, MAXNAMELEN);
+ if (taglen > cbp->cb_max_taglen)
+ cbp->cb_max_taglen = taglen;
+ }
+
+ return (nvlist_add_nvlist(top_nvl, zname, nvl));
+}
+
+/*
+ * zfs holds [-rH] <snap> ...
+ *
+ *	-r	Lists holds on the snapshots of descendent datasets too
+ *	-H	Scripted mode; elide headers and separate columns by tabs
+ */
+static int
+zfs_do_holds(int argc, char **argv)
+{
+ int errors = 0;
+ int c;
+ int i;
+ boolean_t scripted = B_FALSE;
+ boolean_t recursive = B_FALSE;
+ const char *opts = "rH";
+ nvlist_t *nvl;
+
+ int types = ZFS_TYPE_SNAPSHOT;
+ holds_cbdata_t cb = { 0 };
+
+ int limit = 0;
+ int ret;
+ int flags = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, opts)) != -1) {
+ switch (c) {
+ case 'r':
+ recursive = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (recursive) {
+ types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+ flags |= ZFS_ITER_RECURSE;
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1)
+ usage(B_FALSE);
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ for (i = 0; i < argc; ++i) {
+ char *snapshot = argv[i];
+ const char *delim;
+ const char *snapname;
+
+ delim = strchr(snapshot, '@');
+ if (delim == NULL) {
+ (void) fprintf(stderr,
+ gettext("'%s' is not a snapshot\n"), snapshot);
+ ++errors;
+ continue;
+ }
+ snapname = delim + 1;
+ if (recursive)
+ snapshot[delim - snapshot] = '\0';
+
+ cb.cb_recursive = recursive;
+ cb.cb_snapname = snapname;
+ cb.cb_nvlp = &nvl;
+
+ /*
+ * 1. collect holds data, set format options
+ */
+ ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
+ holds_callback, &cb);
+ if (ret != 0)
+ ++errors;
+ }
+
+ /*
+ * 2. print holds data
+ */
+ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl);
+
+ if (nvlist_empty(nvl))
+ (void) printf(gettext("no datasets available\n"));
+
+ nvlist_free(nvl);
+
+ return (0 != errors);
+}
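+
+/*
+ * Hypothetical output of "zfs holds tank/home@tuesday", given one hold
+ * with the invented tag "backup-2011":
+ *
+ *	NAME               TAG          TIMESTAMP
+ *	tank/home@tuesday  backup-2011  Tue Feb  8 14:05 2011
+ */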
#define CHECK_SPINNER 30
#define SPINNER_TIME 3 /* seconds */
@@ -2676,19 +5151,18 @@ typedef struct get_all_cbdata {
static int
get_one_dataset(zfs_handle_t *zhp, void *data)
{
- static char spin[] = { '-', '\\', '|', '/' };
+ static char *spin[] = { "-", "\\", "|", "/" };
static int spinval = 0;
static int spincheck = 0;
static time_t last_spin_time = (time_t)0;
- get_all_cbdata_t *cbp = data;
+ get_all_cb_t *cbp = data;
zfs_type_t type = zfs_get_type(zhp);
if (cbp->cb_verbose) {
if (--spincheck < 0) {
time_t now = time(NULL);
if (last_spin_time + SPINNER_TIME < now) {
- (void) printf("\b%c", spin[spinval++ % 4]);
- (void) fflush(stdout);
+ update_progress(spin[spinval++ % 4]);
last_spin_time = now;
}
spincheck = CHECK_SPINNER;
@@ -2698,8 +5172,7 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
/*
 * Iterate over any nested datasets.
*/
- if (type == ZFS_TYPE_FILESYSTEM &&
- zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
zfs_close(zhp);
return (1);
}
@@ -2707,83 +5180,32 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
/*
* Skip any datasets whose type does not match.
*/
- if ((type & cbp->cb_types) == 0) {
+ if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
zfs_close(zhp);
return (0);
}
-
- if (cbp->cb_alloc == cbp->cb_used) {
- zfs_handle_t **handles;
-
- if (cbp->cb_alloc == 0)
- cbp->cb_alloc = 64;
- else
- cbp->cb_alloc *= 2;
-
- handles = safe_malloc(cbp->cb_alloc * sizeof (void *));
-
- if (cbp->cb_handles) {
- bcopy(cbp->cb_handles, handles,
- cbp->cb_used * sizeof (void *));
- free(cbp->cb_handles);
- }
-
- cbp->cb_handles = handles;
- }
-
- cbp->cb_handles[cbp->cb_used++] = zhp;
+ libzfs_add_handle(cbp, zhp);
+ assert(cbp->cb_used <= cbp->cb_alloc);
return (0);
}
static void
-get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count,
- boolean_t verbose)
+get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
{
- get_all_cbdata_t cb = { 0 };
- cb.cb_types = types;
+ get_all_cb_t cb = { 0 };
cb.cb_verbose = verbose;
+ cb.cb_getone = get_one_dataset;
- if (verbose) {
- (void) printf("%s: *", gettext("Reading ZFS config"));
- (void) fflush(stdout);
- }
-
+ if (verbose)
+ set_progress_header(gettext("Reading ZFS config"));
(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
*dslist = cb.cb_handles;
*count = cb.cb_used;
- if (verbose) {
- (void) printf("\b%s\n", gettext("done."));
- }
-}
-
-static int
-dataset_cmp(const void *a, const void *b)
-{
- zfs_handle_t **za = (zfs_handle_t **)a;
- zfs_handle_t **zb = (zfs_handle_t **)b;
- char mounta[MAXPATHLEN];
- char mountb[MAXPATHLEN];
- boolean_t gota, gotb;
-
- if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
- sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
- if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
- sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
-
- if (gota && gotb)
- return (strcmp(mounta, mountb));
-
- if (gota)
- return (-1);
- if (gotb)
- return (1);
-
- return (strcmp(zfs_get_name(a), zfs_get_name(b)));
+ if (verbose)
+ finish_progress(gettext("done."));
}
/*
@@ -2807,216 +5229,179 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
const char *cmdname = op == OP_SHARE ? "share" : "mount";
struct mnttab mnt;
uint64_t zoned, canmount;
- zfs_type_t type = zfs_get_type(zhp);
boolean_t shared_nfs, shared_smb;
- assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
+ assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
- if (type == ZFS_TYPE_FILESYSTEM) {
- /*
- * Check to make sure we can mount/share this dataset. If we
- * are in the global zone and the filesystem is exported to a
- * local zone, or if we are in a local zone and the
- * filesystem is not exported, then it is an error.
- */
- zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+ /*
+ * Check to make sure we can mount/share this dataset. If we
+ * are in the global zone and the filesystem is exported to a
+ * local zone, or if we are in a local zone and the
+ * filesystem is not exported, then it is an error.
+ */
+ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
- if (zoned && getzoneid() == GLOBAL_ZONEID) {
- if (!explicit)
- return (0);
+ if (zoned && getzoneid() == GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "dataset is exported to a local zone\n"), cmdname,
- zfs_get_name(zhp));
- return (1);
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "dataset is exported to a local zone\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
- } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
- if (!explicit)
- return (0);
+ } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "permission denied\n"), cmdname,
- zfs_get_name(zhp));
- return (1);
- }
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "permission denied\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ }
- /*
- * Ignore any filesystems which don't apply to us. This
- * includes those with a legacy mountpoint, or those with
- * legacy share options.
- */
- verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
- sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
- sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
- sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
-
- if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
- strcmp(smbshareopts, "off") == 0) {
- if (!explicit)
- return (0);
+ /*
+ * Ignore any filesystems which don't apply to us. This
+ * includes those with a legacy mountpoint, or those with
+ * legacy share options.
+ */
+ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+ sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+ sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+ strcmp(smbshareopts, "off") == 0) {
+ if (!explicit)
+ return (0);
- (void) fprintf(stderr, gettext("cannot share '%s': "
- "legacy share\n"), zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use share(1M) to "
- "share this filesystem, or set "
- "sharenfs property on\n"));
- return (1);
- }
+ (void) fprintf(stderr, gettext("cannot share '%s': "
+ "legacy share\n"), zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use share(1M) to "
+ "share this filesystem, or set "
+ "sharenfs property on\n"));
+ return (1);
+ }
- /*
- * We cannot share or mount legacy filesystems. If the
- * shareopts is non-legacy but the mountpoint is legacy, we
- * treat it as a legacy share.
- */
- if (strcmp(mountpoint, "legacy") == 0) {
- if (!explicit)
- return (0);
+ /*
+ * We cannot share or mount legacy filesystems. If the
+ * shareopts is non-legacy but the mountpoint is legacy, we
+ * treat it as a legacy share.
+ */
+ if (strcmp(mountpoint, "legacy") == 0) {
+ if (!explicit)
+ return (0);
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use %s(1M) to "
- "%s this filesystem\n"), cmdname, cmdname);
- return (1);
- }
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use %s(1M) to "
+ "%s this filesystem\n"), cmdname, cmdname);
+ return (1);
+ }
- if (strcmp(mountpoint, "none") == 0) {
- if (!explicit)
- return (0);
+ if (strcmp(mountpoint, "none") == 0) {
+ if (!explicit)
+ return (0);
- (void) fprintf(stderr, gettext("cannot %s '%s': no "
- "mountpoint set\n"), cmdname, zfs_get_name(zhp));
- return (1);
- }
+ (void) fprintf(stderr, gettext("cannot %s '%s': no "
+ "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+ return (1);
+ }
- /*
- * canmount explicit outcome
- * on no pass through
- * on yes pass through
- * off no return 0
- * off yes display error, return 1
- * noauto no return 0
- * noauto yes pass through
- */
- canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
- if (canmount == ZFS_CANMOUNT_OFF) {
+ /*
+ * canmount explicit outcome
+ * on no pass through
+ * on yes pass through
+ * off no return 0
+ * off yes display error, return 1
+ * noauto no return 0
+ * noauto yes pass through
+ */
+ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+ if (canmount == ZFS_CANMOUNT_OFF) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "'canmount' property is set to 'off'\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+ return (0);
+ }
+
+ /*
+ * At this point, we have verified that the mountpoint and/or
+ * shareopts are appropriate for auto management. If the
+ * filesystem is already mounted or shared, return (failing
+ * for explicit requests); otherwise mount or share the
+ * filesystem.
+ */
+ switch (op) {
+ case OP_SHARE:
+
+ shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+ shared_smb = zfs_is_shared_smb(zhp, NULL);
+
+	if ((shared_nfs && shared_smb) ||
+ (shared_nfs && strcmp(shareopts, "on") == 0 &&
+ strcmp(smbshareopts, "off") == 0) ||
+ (shared_smb && strcmp(smbshareopts, "on") == 0 &&
+ strcmp(shareopts, "off") == 0)) {
if (!explicit)
return (0);
- (void) fprintf(stderr, gettext("cannot %s '%s': "
- "'canmount' property is set to 'off'\n"), cmdname,
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': filesystem already shared\n"),
zfs_get_name(zhp));
return (1);
- } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
- return (0);
}
- /*
- * At this point, we have verified that the mountpoint and/or
- * shareopts are appropriate for auto management. If the
- * filesystem is already mounted or shared, return (failing
- * for explicit requests); otherwise mount or share the
- * filesystem.
- */
- switch (op) {
- case OP_SHARE:
-
- shared_nfs = zfs_is_shared_nfs(zhp, NULL);
- shared_smb = zfs_is_shared_smb(zhp, NULL);
-
- if (shared_nfs && shared_smb ||
- (shared_nfs && strcmp(shareopts, "on") == 0 &&
- strcmp(smbshareopts, "off") == 0) ||
- (shared_smb && strcmp(smbshareopts, "on") == 0 &&
- strcmp(shareopts, "off") == 0)) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot share "
- "'%s': filesystem already shared\n"),
- zfs_get_name(zhp));
- return (1);
- }
+ if (!zfs_is_mounted(zhp, NULL) &&
+ zfs_mount(zhp, NULL, 0) != 0)
+ return (1);
- if (!zfs_is_mounted(zhp, NULL) &&
- zfs_mount(zhp, NULL, 0) != 0)
+ if (protocol == NULL) {
+ if (zfs_shareall(zhp) != 0)
return (1);
-
- if (protocol == NULL) {
- if (zfs_shareall(zhp) != 0)
- return (1);
- } else if (strcmp(protocol, "nfs") == 0) {
- if (zfs_share_nfs(zhp))
- return (1);
- } else if (strcmp(protocol, "smb") == 0) {
- if (zfs_share_smb(zhp))
- return (1);
- } else {
- (void) fprintf(stderr, gettext("cannot share "
- "'%s': invalid share type '%s' "
- "specified\n"),
- zfs_get_name(zhp), protocol);
+ } else if (strcmp(protocol, "nfs") == 0) {
+ if (zfs_share_nfs(zhp))
return (1);
- }
-
- break;
-
- case OP_MOUNT:
- if (options == NULL)
- mnt.mnt_mntopts = "";
- else
- mnt.mnt_mntopts = (char *)options;
-
- if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
- zfs_is_mounted(zhp, NULL)) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot mount "
- "'%s': filesystem already mounted\n"),
- zfs_get_name(zhp));
- return (1);
- }
-
- if (zfs_mount(zhp, options, flags) != 0)
+ } else if (strcmp(protocol, "smb") == 0) {
+ if (zfs_share_smb(zhp))
return (1);
- break;
+ } else {
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': invalid share type '%s' "
+ "specified\n"),
+ zfs_get_name(zhp), protocol);
+ return (1);
}
- } else {
- assert(op == OP_SHARE);
- /*
- * Ignore any volumes that aren't shared.
- */
- verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
- sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+ break;
- if (strcmp(shareopts, "off") == 0) {
- if (!explicit)
- return (0);
-
- (void) fprintf(stderr, gettext("cannot share '%s': "
- "'shareiscsi' property not set\n"),
- zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("set 'shareiscsi' "
- "property or use iscsitadm(1M) to share this "
- "volume\n"));
- return (1);
- }
+ case OP_MOUNT:
+ if (options == NULL)
+ mnt.mnt_mntopts = "";
+ else
+ mnt.mnt_mntopts = (char *)options;
- if (zfs_is_shared_iscsi(zhp)) {
+ if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+ zfs_is_mounted(zhp, NULL)) {
if (!explicit)
return (0);
- (void) fprintf(stderr, gettext("cannot share "
- "'%s': volume already shared\n"),
+ (void) fprintf(stderr, gettext("cannot mount "
+ "'%s': filesystem already mounted\n"),
zfs_get_name(zhp));
return (1);
}
- if (zfs_share_iscsi(zhp) != 0)
+ if (zfs_mount(zhp, options, flags) != 0)
return (1);
+ break;
}
return (0);
@@ -3028,19 +5413,16 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
static void
report_mount_progress(int current, int total)
{
- static int len;
- static char *reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"
- "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
- static time_t last_progress_time;
+ static time_t last_progress_time = 0;
time_t now = time(NULL);
+ char info[32];
/* report 1..n instead of 0..n-1 */
++current;
/* display header if we're here for the first time */
if (current == 1) {
- (void) printf(gettext("Mounting ZFS filesystems: "));
- len = 0;
+ set_progress_header(gettext("Mounting ZFS filesystems"));
} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
/* too soon to report again */
return;
@@ -3048,13 +5430,12 @@ report_mount_progress(int current, int total)
last_progress_time = now;
- /* back up to prepare for overwriting */
- if (len)
- (void) printf("%*.*s", len, len, reverse);
+ (void) sprintf(info, "(%d/%d)", current, total);
- /* We put a newline at the end if this is the last one. */
- len = printf("(%d/%d)%s", current, total, current == total ? "\n" : "");
- (void) fflush(stdout);
+ if (current == total)
+ finish_progress(info);
+ else
+ update_progress(info);
}
static void
@@ -3083,7 +5464,7 @@ share_mount(int op, int argc, char **argv)
boolean_t verbose = B_FALSE;
int c, ret = 0;
char *options = NULL;
- int types, flags = 0;
+ int flags = 0;
/* check options */
while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
@@ -3133,24 +5514,16 @@ share_mount(int op, int argc, char **argv)
size_t i, count = 0;
char *protocol = NULL;
- if (op == OP_MOUNT) {
- types = ZFS_TYPE_FILESYSTEM;
- } else if (argc > 0) {
- if (strcmp(argv[0], "nfs") == 0 ||
- strcmp(argv[0], "smb") == 0) {
- types = ZFS_TYPE_FILESYSTEM;
- } else if (strcmp(argv[0], "iscsi") == 0) {
- types = ZFS_TYPE_VOLUME;
- } else {
+ if (op == OP_SHARE && argc > 0) {
+ if (strcmp(argv[0], "nfs") != 0 &&
+ strcmp(argv[0], "smb") != 0) {
(void) fprintf(stderr, gettext("share type "
- "must be 'nfs', 'smb' or 'iscsi'\n"));
+ "must be 'nfs' or 'smb'\n"));
usage(B_FALSE);
}
protocol = argv[0];
argc--;
argv++;
- } else {
- types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
}
if (argc != 0) {
@@ -3158,12 +5531,13 @@ share_mount(int op, int argc, char **argv)
usage(B_FALSE);
}
- get_all_datasets(types, &dslist, &count, verbose);
+ start_progress_timer();
+ get_all_datasets(&dslist, &count, verbose);
if (count == 0)
return (0);
- qsort(dslist, count, sizeof (void *), dataset_cmp);
+ qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
for (i = 0; i < count; i++) {
if (verbose)
@@ -3177,8 +5551,7 @@ share_mount(int op, int argc, char **argv)
free(dslist);
} else if (argc == 0) {
- struct statfs *sfs;
- int i, n;
+ struct mnttab entry;
if ((op == OP_SHARE) || (options != NULL)) {
(void) fprintf(stderr, gettext("missing filesystem "
@@ -3191,33 +5564,27 @@ share_mount(int op, int argc, char **argv)
* display any active ZFS mounts. We hide any snapshots, since
* they are controlled automatically.
*/
- if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
- fprintf(stderr, "getmntinfo(): %s\n", strerror(errno));
- return (0);
- }
- for (i = 0; i < n; i++) {
- if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 ||
- strchr(sfs[i].f_mntfromname, '@') != NULL)
+ rewind(mnttab_file);
+ while (getmntent(mnttab_file, &entry) == 0) {
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
+ strchr(entry.mnt_special, '@') != NULL)
continue;
- (void) printf("%-30s %s\n", sfs[i].f_mntfromname,
- sfs[i].f_mntonname);
+ (void) printf("%-30s %s\n", entry.mnt_special,
+ entry.mnt_mountp);
}
} else {
zfs_handle_t *zhp;
- types = ZFS_TYPE_FILESYSTEM;
- if (op == OP_SHARE)
- types |= ZFS_TYPE_VOLUME;
-
if (argc > 1) {
(void) fprintf(stderr,
gettext("too many arguments\n"));
usage(B_FALSE);
}
- if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
+ if ((zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM)) == NULL) {
ret = 1;
} else {
ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
@@ -3230,7 +5597,7 @@ share_mount(int op, int argc, char **argv)
}
/*
- * zfs mount -a [nfs | iscsi]
+ * zfs mount -a [nfs]
* zfs mount filesystem
*
* Mount all filesystems, or mount the given filesystem.
@@ -3242,7 +5609,7 @@ zfs_do_mount(int argc, char **argv)
}
/*
- * zfs share -a [nfs | iscsi | smb]
+ * zfs share -a [nfs | smb]
* zfs share filesystem
*
* Share all filesystems, or share the given filesystem.
@@ -3280,7 +5647,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
zfs_handle_t *zhp;
int ret;
struct stat64 statbuf;
- struct mnttab search = { 0 }, entry;
+ struct extmnttab entry;
const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
ino_t path_inode;
@@ -3300,9 +5667,26 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
/*
* Search for the given (major,minor) pair in the mount table.
*/
- search.mnt_mountp = path;
+#ifdef sun
rewind(mnttab_file);
- if (getmntany(mnttab_file, &entry, &search) != 0) {
+ while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
+ if (entry.mnt_major == major(statbuf.st_dev) &&
+ entry.mnt_minor == minor(statbuf.st_dev))
+ break;
+ }
+#else
+ {
+ struct statfs sfs;
+
+ if ((ret = statfs(path, &sfs)) != 0) {
+ (void) fprintf(stderr, "%s: %s\n", path,
+ strerror(errno));
+ } else {
+ statfs2mnttab(&sfs, &entry);
+ }
+ }
+#endif
+ if (ret != 0) {
if (op == OP_SHARE) {
(void) fprintf(stderr, gettext("cannot %s '%s': not "
"currently mounted\n"), cmdname, path);
@@ -3392,9 +5776,9 @@ unshare_unmount(int op, int argc, char **argv)
int do_all = 0;
int flags = 0;
int ret = 0;
- int types, c;
+ int c;
zfs_handle_t *zhp;
- char nfsiscsi_mnt_prop[ZFS_MAXPROPLEN];
+ char nfs_mnt_prop[ZFS_MAXPROPLEN];
char sharesmb[ZFS_MAXPROPLEN];
/* check options */
@@ -3431,51 +5815,37 @@ unshare_unmount(int op, int argc, char **argv)
* the special type (dataset name), and walk the result in
* reverse to make sure to get any snapshots first.
*/
+ struct mnttab entry;
uu_avl_pool_t *pool;
uu_avl_t *tree;
unshare_unmount_node_t *node;
uu_avl_index_t idx;
uu_avl_walk_t *walk;
- struct statfs *sfs;
- int i, n;
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
- if ((pool = uu_avl_pool_create("unmount_pool",
+ if (((pool = uu_avl_pool_create("unmount_pool",
sizeof (unshare_unmount_node_t),
offsetof(unshare_unmount_node_t, un_avlnode),
- unshare_unmount_compare,
- UU_DEFAULT)) == NULL) {
- (void) fprintf(stderr, gettext("internal error: "
- "out of memory\n"));
- exit(1);
- }
-
- if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
- (void) fprintf(stderr, gettext("internal error: "
- "out of memory\n"));
- exit(1);
- }
+ unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+ ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+ nomem();
- if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
- (void) fprintf(stderr, gettext("internal error: "
- "getmntinfo() failed\n"));
- exit(1);
- }
- for (i = 0; i < n; i++) {
+ rewind(mnttab_file);
+ while (getmntent(mnttab_file, &entry) == 0) {
/* ignore non-ZFS entries */
- if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
continue;
/* ignore snapshots */
- if (strchr(sfs[i].f_mntfromname, '@') != NULL)
+ if (strchr(entry.mnt_special, '@') != NULL)
continue;
- if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname,
+ if ((zhp = zfs_open(g_zfs, entry.mnt_special,
ZFS_TYPE_FILESYSTEM)) == NULL) {
ret = 1;
continue;
@@ -3484,25 +5854,25 @@ unshare_unmount(int op, int argc, char **argv)
switch (op) {
case OP_SHARE:
verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
- nfsiscsi_mnt_prop,
- sizeof (nfsiscsi_mnt_prop),
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfsiscsi_mnt_prop, "off") != 0)
+ if (strcmp(nfs_mnt_prop, "off") != 0)
break;
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
- nfsiscsi_mnt_prop,
- sizeof (nfsiscsi_mnt_prop),
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfsiscsi_mnt_prop, "off") == 0)
+ if (strcmp(nfs_mnt_prop, "off") == 0)
continue;
break;
case OP_MOUNT:
/* Ignore legacy mounts */
verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
- nfsiscsi_mnt_prop,
- sizeof (nfsiscsi_mnt_prop),
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
- if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0)
+ if (strcmp(nfs_mnt_prop, "legacy") == 0)
continue;
/* Ignore canmount=noauto mounts */
if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
@@ -3514,13 +5884,7 @@ unshare_unmount(int op, int argc, char **argv)
node = safe_malloc(sizeof (unshare_unmount_node_t));
node->un_zhp = zhp;
-
- if ((node->un_mountp = strdup(sfs[i].f_mntonname)) ==
- NULL) {
- (void) fprintf(stderr, gettext("internal error:"
- " out of memory\n"));
- exit(1);
- }
+ node->un_mountp = safe_strdup(entry.mnt_mountp);
uu_avl_node_init(node, &node->un_avlnode, pool);
@@ -3538,11 +5902,8 @@ unshare_unmount(int op, int argc, char **argv)
* removing it from the AVL tree in the process.
*/
if ((walk = uu_avl_walk_start(tree,
- UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
- (void) fprintf(stderr,
- gettext("internal error: out of memory"));
- exit(1);
- }
+ UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+ nomem();
while ((node = uu_avl_walk_next(walk)) != NULL) {
uu_avl_remove(tree, node);
@@ -3570,29 +5931,6 @@ unshare_unmount(int op, int argc, char **argv)
uu_avl_destroy(tree);
uu_avl_pool_destroy(pool);
- if (op == OP_SHARE) {
- /*
- * Finally, unshare any volumes shared via iSCSI.
- */
- zfs_handle_t **dslist = NULL;
- size_t i, count = 0;
-
- get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count,
- B_FALSE);
-
- if (count != 0) {
- qsort(dslist, count, sizeof (void *),
- dataset_cmp);
-
- for (i = 0; i < count; i++) {
- if (zfs_unshare_iscsi(dslist[i]) != 0)
- ret = 1;
- zfs_close(dslist[i]);
- }
-
- free(dslist);
- }
- }
} else {
if (argc != 1) {
if (argc == 0)
@@ -3614,91 +5952,63 @@ unshare_unmount(int op, int argc, char **argv)
return (unshare_unmount_path(op, argv[0],
flags, B_FALSE));
- types = ZFS_TYPE_FILESYSTEM;
- if (op == OP_SHARE)
- types |= ZFS_TYPE_VOLUME;
-
- if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
+ if ((zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM)) == NULL)
return (1);
- if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
- verify(zfs_prop_get(zhp, op == OP_SHARE ?
- ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
- nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop), NULL,
- NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, op == OP_SHARE ?
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+ nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+ NULL, 0, B_FALSE) == 0);
- switch (op) {
- case OP_SHARE:
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
- nfsiscsi_mnt_prop,
- sizeof (nfsiscsi_mnt_prop),
- NULL, NULL, 0, B_FALSE) == 0);
- verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
- sharesmb, sizeof (sharesmb), NULL, NULL,
- 0, B_FALSE) == 0);
-
- if (strcmp(nfsiscsi_mnt_prop, "off") == 0 &&
- strcmp(sharesmb, "off") == 0) {
- (void) fprintf(stderr, gettext("cannot "
- "unshare '%s': legacy share\n"),
- zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use "
- "unshare(1M) to unshare this "
- "filesystem\n"));
- ret = 1;
- } else if (!zfs_is_shared(zhp)) {
- (void) fprintf(stderr, gettext("cannot "
- "unshare '%s': not currently "
- "shared\n"), zfs_get_name(zhp));
- ret = 1;
- } else if (zfs_unshareall(zhp) != 0) {
- ret = 1;
- }
- break;
-
- case OP_MOUNT:
- if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0) {
- (void) fprintf(stderr, gettext("cannot "
- "unmount '%s': legacy "
- "mountpoint\n"), zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("use "
- "umount(1M) to unmount this "
- "filesystem\n"));
- ret = 1;
- } else if (!zfs_is_mounted(zhp, NULL)) {
- (void) fprintf(stderr, gettext("cannot "
- "unmount '%s': not currently "
- "mounted\n"),
- zfs_get_name(zhp));
- ret = 1;
- } else if (zfs_unmountall(zhp, flags) != 0) {
- ret = 1;
- }
- break;
- }
- } else {
- assert(op == OP_SHARE);
-
- verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI,
- nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop),
+ switch (op) {
+ case OP_SHARE:
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+ sharesmb, sizeof (sharesmb), NULL, NULL,
+ 0, B_FALSE) == 0);
- if (strcmp(nfsiscsi_mnt_prop, "off") == 0) {
- (void) fprintf(stderr, gettext("cannot unshare "
- "'%s': 'shareiscsi' property not set\n"),
+ if (strcmp(nfs_mnt_prop, "off") == 0 &&
+ strcmp(sharesmb, "off") == 0) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unshare '%s': legacy share\n"),
zfs_get_name(zhp));
- (void) fprintf(stderr, gettext("set "
- "'shareiscsi' property or use "
- "iscsitadm(1M) to share this volume\n"));
+ (void) fprintf(stderr, gettext("use "
+ "unshare(1M) to unshare this "
+ "filesystem\n"));
ret = 1;
- } else if (!zfs_is_shared_iscsi(zhp)) {
+ } else if (!zfs_is_shared(zhp)) {
(void) fprintf(stderr, gettext("cannot "
- "unshare '%s': not currently shared\n"),
+ "unshare '%s': not currently "
+ "shared\n"), zfs_get_name(zhp));
+ ret = 1;
+ } else if (zfs_unshareall(zhp) != 0) {
+ ret = 1;
+ }
+ break;
+
+ case OP_MOUNT:
+ if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unmount '%s': legacy "
+ "mountpoint\n"), zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use "
+ "umount(1M) to unmount this "
+ "filesystem\n"));
+ ret = 1;
+ } else if (!zfs_is_mounted(zhp, NULL)) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unmount '%s': not currently "
+ "mounted\n"),
zfs_get_name(zhp));
ret = 1;
- } else if (zfs_unshare_iscsi(zhp) != 0) {
+ } else if (zfs_unmountall(zhp, flags) != 0) {
ret = 1;
}
+ break;
}
zfs_close(zhp);
@@ -3793,16 +6103,6 @@ zfs_do_unjail(int argc, char **argv)
return (do_jail(argc, argv, 0));
}
-/* ARGSUSED */
-static int
-zfs_do_python(int argc, char **argv)
-{
- (void) execv(pypath, argv-1);
- (void) fprintf(stderr, "internal error: %s not found\n", pypath);
- (void) fprintf(stderr, " install sysutils/py-zfs port to correct this\n");
- return (-1);
-}
-
/*
* Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is
* 'legacy'. Otherwise, complain that use should be using 'zfs mount'.
@@ -3825,14 +6125,10 @@ manual_mount(int argc, char **argv)
(void) strlcpy(mntopts, optarg, sizeof (mntopts));
break;
case 'O':
-#if 0 /* FreeBSD: No support for MS_OVERLAY. */
flags |= MS_OVERLAY;
-#endif
break;
case 'm':
-#if 0 /* FreeBSD: No support for MS_NOMNTTAB. */
flags |= MS_NOMNTTAB;
-#endif
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
@@ -3943,27 +6239,6 @@ manual_unmount(int argc, char **argv)
}
static int
-volcheck(zpool_handle_t *zhp, void *data)
-{
- boolean_t isinit = *((boolean_t *)data);
-
- if (isinit)
- return (zpool_create_zvol_links(zhp));
- else
- return (zpool_remove_zvol_links(zhp));
-}
-
-/*
- * Iterate over all pools in the system and either create or destroy /dev/zvol
- * links, depending on the value of 'isinit'.
- */
-static int
-do_volcheck(boolean_t isinit)
-{
- return (zpool_iter(g_zfs, volcheck, &isinit) ? 1 : 0);
-}
-
-static int
find_command_idx(char *command, int *idx)
{
int i;
@@ -3980,6 +6255,81 @@ find_command_idx(char *command, int *idx)
return (1);
}
+static int
+zfs_do_diff(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ int flags = 0;
+ char *tosnap = NULL;
+ char *fromsnap = NULL;
+ char *atp, *copy;
+ int err;
+ int c;
+
+ while ((c = getopt(argc, argv, "FHt")) != -1) {
+ switch (c) {
+ case 'F':
+ flags |= ZFS_DIFF_CLASSIFY;
+ break;
+ case 'H':
+ flags |= ZFS_DIFF_PARSEABLE;
+ break;
+ case 't':
+ flags |= ZFS_DIFF_TIMESTAMP;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr,
+ gettext("must provide at least one snapshot name\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ fromsnap = argv[0];
+ tosnap = (argc == 2) ? argv[1] : NULL;
+
+ copy = NULL;
+ if (*fromsnap != '@')
+ copy = strdup(fromsnap);
+ else if (tosnap)
+ copy = strdup(tosnap);
+ if (copy == NULL)
+ usage(B_FALSE);
+
+ if ((atp = strchr(copy, '@')) != NULL)
+ *atp = '\0';
+
+ if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
+ return (1);
+
+ free(copy);
+
+ /*
+ * Ignore SIGPIPE so that the library can give us
+ * information on any failure
+ */
+ (void) sigignore(SIGPIPE);
+
+ err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+
+ zfs_close(zhp);
+
+ return (err != 0);
+}
+
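
The operand handling above accepts three spellings, and the dataset to open is always whatever precedes the '@'. A compact restatement of that parsing (a sketch mirroring the strdup/strchr logic in zfs_do_diff() above, not a separate API):

	#include <stdlib.h>
	#include <string.h>

	/*
	 * "tank/home@monday tank/home@tuesday" -> open "tank/home"
	 * "tank/home@monday"                   -> open "tank/home"
	 * "@monday tank/home@tuesday"          -> dataset borrowed from tosnap
	 */
	static char *
	diff_dataset(const char *fromsnap, const char *tosnap)
	{
		char *copy = NULL, *atp;

		if (*fromsnap != '@')
			copy = strdup(fromsnap);
		else if (tosnap != NULL)
			copy = strdup(tosnap);
		if (copy != NULL && (atp = strchr(copy, '@')) != NULL)
			*atp = '\0';	/* truncate at snapshot delimiter */
		return (copy);	/* NULL means no dataset can be inferred */
	}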
int
main(int argc, char **argv)
{
@@ -4049,15 +6399,6 @@ main(int argc, char **argv)
usage(B_TRUE);
/*
- * 'volinit' and 'volfini' do not appear in the usage message,
- * so we have to special case them here.
- */
- if (strcmp(cmdname, "volinit") == 0)
- return (do_volcheck(B_TRUE));
- else if (strcmp(cmdname, "volfini") == 0)
- return (do_volcheck(B_FALSE));
-
- /*
* Run the appropriate command.
*/
libzfs_mnttab_cache(g_zfs, B_TRUE);
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h b/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
index c7f2f1618647..3ddff9e22d7d 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZFS_UTIL_H
#define _ZFS_UTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <libzfs.h>
#ifdef __cplusplus
@@ -35,6 +32,7 @@ extern "C" {
#endif
void * safe_malloc(size_t size);
+void nomem(void);
libzfs_handle_t *g_zfs;
#ifdef __cplusplus
diff --git a/cddl/contrib/opensolaris/cmd/zinject/translate.c b/cddl/contrib/opensolaris/cmd/zinject/translate.c
index da26cd633302..442f220c442a 100644
--- a/cddl/contrib/opensolaris/cmd/zinject/translate.c
+++ b/cddl/contrib/opensolaris/cmd/zinject/translate.c
@@ -19,14 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <libzfs.h>
-#undef verify /* both libzfs.h and zfs_context.h want to define this */
-
#include <sys/zfs_context.h>
#include <errno.h>
@@ -49,9 +46,6 @@
#include "zinject.h"
-#include <assert.h>
-#define verify assert
-
extern void kernel_init(int);
extern void kernel_fini(void);
@@ -70,6 +64,18 @@ ziprintf(const char *fmt, ...)
va_end(ap);
}
+static void
+compress_slashes(const char *src, char *dest)
+{
+ while (*src != '\0') {
+ *dest = *src++;
+ while (*dest == '/' && *src == '/')
+ ++src;
+ ++dest;
+ }
+ *dest = '\0';
+}
+
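
A quick standalone check of the slash squeezing above (same loop, hypothetical harness not part of the commit):

	#include <stdio.h>

	static void
	squeeze(const char *src, char *dest)
	{
		while (*src != '\0') {
			*dest = *src++;
			while (*dest == '/' && *src == '/')	/* skip runs */
				++src;
			++dest;
		}
		*dest = '\0';
	}

	int
	main(void)
	{
		char buf[64];

		squeeze("//tank///fs//file", buf);
		(void) printf("%s\n", buf);	/* prints "/tank/fs/file" */
		return (0);
	}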
/*
* Given a full path to a file, translate into a dataset name and a relative
* path within the dataset. 'dataset' must be at least MAXNAMELEN characters,
@@ -77,11 +83,14 @@ ziprintf(const char *fmt, ...)
* buffer, which we need later to get the object ID.
*/
static int
-parse_pathname(const char *fullpath, char *dataset, char *relpath,
+parse_pathname(const char *inpath, char *dataset, char *relpath,
struct stat64 *statbuf)
{
struct statfs sfs;
const char *rel;
+ char fullpath[MAXPATHLEN];
+
+ compress_slashes(inpath, fullpath);
if (fullpath[0] != '/') {
(void) fprintf(stderr, "invalid object '%s': must be full "
@@ -148,8 +157,8 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
*/
sync();
- if ((err = dmu_objset_open(dataset, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+ err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os);
+ if (err != 0) {
(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
dataset, strerror(err));
return (-1);
@@ -158,7 +167,7 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
record->zi_objset = dmu_objset_id(os);
record->zi_object = statbuf->st_ino;
- dmu_objset_close(os);
+ dmu_objset_disown(os, FTAG);
return (0);
}
@@ -233,17 +242,17 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
* Get the dnode associated with object, so we can calculate the block
* size.
*/
- if ((err = dmu_objset_open(dataset, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+ if ((err = dmu_objset_own(dataset, DMU_OST_ANY,
+ B_TRUE, FTAG, &os)) != 0) {
(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
dataset, strerror(err));
goto out;
}
if (record->zi_object == 0) {
- dn = os->os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
} else {
- err = dnode_hold(os->os, record->zi_object, FTAG, &dn);
+ err = dnode_hold(os, record->zi_object, FTAG, &dn);
if (err != 0) {
(void) fprintf(stderr, "failed to hold dnode "
"for object %llu\n",
@@ -292,11 +301,11 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
ret = 0;
out:
if (dn) {
- if (dn != os->os->os_meta_dnode)
+ if (dn != DMU_META_DNODE(os))
dnode_rele(dn, FTAG);
}
if (os)
- dmu_objset_close(os);
+ dmu_objset_disown(os, FTAG);
return (ret);
}
@@ -333,8 +342,8 @@ translate_record(err_type_t type, const char *object, const char *range,
case TYPE_CONFIG:
record->zi_type = DMU_OT_PACKED_NVLIST;
break;
- case TYPE_BPLIST:
- record->zi_type = DMU_OT_BPLIST;
+ case TYPE_BPOBJ:
+ record->zi_type = DMU_OT_BPOBJ;
break;
case TYPE_SPACEMAP:
record->zi_type = DMU_OT_SPACE_MAP;
@@ -455,6 +464,14 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
break;
+ case TYPE_LABEL_PAD1:
+ record->zi_start = offsetof(vdev_label_t, vl_pad1);
+ record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+ break;
+ case TYPE_LABEL_PAD2:
+ record->zi_start = offsetof(vdev_label_t, vl_pad2);
+ record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+ break;
}
return (0);
}
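
The new pad offsets fall out of the vdev label layout. A reference sketch with assumed sizes (these match the contemporary on-disk format, but treat the constants and the sk_* names as illustrative, not authoritative):

	#include <assert.h>
	#include <stddef.h>

	#define	SK_PAD_SIZE	(8 << 10)	/* assumed VDEV_PAD_SIZE */
	#define	SK_PHYS_SIZE	(112 << 10)	/* assumed VDEV_PHYS_SIZE */
	#define	SK_UBER_RING	(128 << 10)	/* assumed uberblock ring */

	typedef struct sk_vdev_label {
		char	vl_pad1[SK_PAD_SIZE];		/* TYPE_LABEL_PAD1 */
		char	vl_pad2[SK_PAD_SIZE];		/* TYPE_LABEL_PAD2 */
		char	vl_vdev_phys[SK_PHYS_SIZE];	/* TYPE_LABEL_NVLIST */
		char	vl_uberblock[SK_UBER_RING];	/* TYPE_LABEL_UBERBLOCK */
	} sk_vdev_label_t;

	int
	main(void)
	{
		/* Each injection range is [zi_start, zi_end] in one label. */
		assert(sizeof (sk_vdev_label_t) == (256 << 10));
		assert(offsetof(sk_vdev_label_t, vl_pad2) == SK_PAD_SIZE);
		return (0);
	}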
diff --git a/cddl/contrib/opensolaris/cmd/zinject/zinject.c b/cddl/contrib/opensolaris/cmd/zinject/zinject.c
index e8327e8dcdf5..51d2fc97ccd0 100644
--- a/cddl/contrib/opensolaris/cmd/zinject/zinject.c
+++ b/cddl/contrib/opensolaris/cmd/zinject/zinject.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -42,12 +41,12 @@
* any attempt to read from the device will return EIO, but any attempt to
* reopen the device will also return ENXIO.
* For label faults, the -L option must be specified. This allows faults
- * to be injected into either the nvlist or uberblock region of all the labels
- * for the specified device.
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
*
* This form of the command looks like:
*
- * zinject -d device [-e errno] [-L <uber | nvlist>] pool
+ * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
*
*
* DATA FAULTS
@@ -70,7 +69,7 @@
* mos Any data in the MOS
* mosdir object directory
* config pool configuration
- * bplist blkptr list
+ * bpobj blkptr list
* spacemap spacemap
* metaslab metaslab
* errlog persistent error log
@@ -167,11 +166,13 @@ static const char *errtable[TYPE_INVAL] = {
"mosdir",
"metaslab",
"config",
- "bplist",
+ "bpobj",
"spacemap",
"errlog",
"uber",
- "nvlist"
+ "nvlist",
+ "pad1",
+ "pad2"
};
static err_type_t
@@ -195,8 +196,8 @@ type_to_name(uint64_t type)
return ("metaslab");
case DMU_OT_PACKED_NVLIST:
return ("config");
- case DMU_OT_BPLIST:
- return ("bplist");
+ case DMU_OT_BPOBJ:
+ return ("bpobj");
case DMU_OT_SPACE_MAP:
return ("spacemap");
case DMU_OT_ERROR_LOG:
@@ -225,10 +226,27 @@ usage(void)
"\t\tClear the particular record (if given a numeric ID), or\n"
"\t\tall records if 'all' is specificed.\n"
"\n"
- "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
+ "\tzinject -p <function name> pool\n"
+ "\t\tInject a panic fault at the specified function. Only\n"
+ "\t\tfunctions that call spa_vdev_config_exit() or\n"
+ "\t\tspa_vdev_exit() will trigger a panic.\n"
+ "\n"
+ "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+ "\t [-T <read|write|free|claim|all>] pool\n"
"\t\tInject a fault into a particular device or the device's\n"
- "\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n"
- "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
+ "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n"
+ "\t\t'pad1', or 'pad2'.\n"
+ "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
+ "\n"
+ "\tzinject -d device -A <degrade|fault> pool\n"
+ "\t\tPerform a specific action on a particular device\n"
+ "\n"
+ "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+ "\t\tCause the pool to stop writing blocks but not\n"
+ "\t\treport errors for a duration. Simulates buggy hardware\n"
+ "\t\tthat fails to honor cache flush requests.\n"
+ "\t\tDefault duration is 30 seconds. The machine is panicked\n"
+ "\t\tat the end of the duration.\n"
"\n"
"\tzinject -b objset:object:level:blkid pool\n"
"\n"
@@ -270,7 +288,7 @@ usage(void)
"\t\t\ton a ZFS filesystem.\n"
"\n"
"\t-t <mos>\tInject errors into the MOS for objects of the given\n"
- "\t\t\ttype. Valid types are: mos, mosdir, config, bplist,\n"
+ "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"
"\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"
"\t\t\tthe poolname.\n");
}
@@ -289,6 +307,12 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
&zc.zc_inject_record, data)) != 0)
return (ret);
+ if (errno != ENOENT) {
+ (void) fprintf(stderr, "Unable to list handlers: %s\n",
+ strerror(errno));
+ return (-1);
+ }
+
return (0);
}
@@ -298,7 +322,7 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;
- if (record->zi_guid != 0)
+ if (record->zi_guid != 0 || record->zi_func[0] != '\0')
return (0);
if (*count == 0) {
@@ -330,7 +354,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;
- if (record->zi_guid == 0)
+ if (record->zi_guid == 0 || record->zi_func[0] != '\0')
return (0);
if (*count == 0) {
@@ -346,6 +370,27 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_func[0] == '\0')
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");
+ (void) printf("--- --------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %s\n", id, pool, record->zi_func);
+
+ return (0);
+}
+
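
The three printers partition the handler list by inspecting two record fields, and each handler lands in exactly one class (a panic function wins even when a guid is also set). The discrimination rule, pulled out as a sketch with a pared-down record type standing in for zinject_record_t:

	/* Illustrative: the rule the print_*_handler() filters implement. */
	struct sk_record {
		unsigned long long zi_guid;
		char zi_func[64];
	};

	enum sk_class { SK_PANIC, SK_DEVICE, SK_DATA };

	static enum sk_class
	sk_classify(const struct sk_record *r)
	{
		if (r->zi_func[0] != '\0')
			return (SK_PANIC);	/* -p registrations */
		if (r->zi_guid != 0)
			return (SK_DEVICE);	/* -d registrations */
		return (SK_DATA);		/* data-error injections */
	}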
/*
* Print all registered error handlers. Returns the number of handlers
* registered.
@@ -353,14 +398,25 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
static int
print_all_handlers(void)
{
- int count = 0;
+ int count = 0, total = 0;
(void) iter_handlers(print_device_handler, &count);
- (void) printf("\n");
- count = 0;
+ if (count > 0) {
+ total += count;
+ (void) printf("\n");
+ count = 0;
+ }
+
(void) iter_handlers(print_data_handler, &count);
+ if (count > 0) {
+ total += count;
+ (void) printf("\n");
+ count = 0;
+ }
+
+ (void) iter_handlers(print_panic_handler, &count);
- return (count);
+ return (count + total);
}
/* ARGSUSED */
@@ -389,7 +445,8 @@ cancel_all_handlers(void)
{
int ret = iter_handlers(cancel_one_handler, NULL);
- (void) printf("removed all registered handlers\n");
+ if (ret == 0)
+ (void) printf("removed all registered handlers\n");
return (ret);
}
@@ -446,6 +503,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
if (record->zi_guid) {
(void) printf(" vdev: %llx\n",
(u_longlong_t)record->zi_guid);
+ } else if (record->zi_func[0] != '\0') {
+ (void) printf(" panic function: %s\n",
+ record->zi_func);
+ } else if (record->zi_duration > 0) {
+ (void) printf(" time: %lld seconds\n",
+ (u_longlong_t)record->zi_duration);
+ } else if (record->zi_duration < 0) {
+ (void) printf(" txgs: %lld\n",
+ (u_longlong_t)-record->zi_duration);
} else {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
@@ -468,6 +534,22 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
}
int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+ zfs_cmd_t zc = { 0 };
+
+ ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+ (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+ zc.zc_guid = record->zi_guid;
+ zc.zc_cookie = cmd;
+
+ if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ return (0);
+
+ return (1);
+}
+
+int
main(int argc, char **argv)
{
int c;
@@ -480,12 +562,17 @@ main(int argc, char **argv)
int quiet = 0;
int error = 0;
int domount = 0;
+ int io_type = ZIO_TYPES;
+ int action = VDEV_STATE_UNKNOWN;
err_type_t type = TYPE_INVAL;
err_type_t label = TYPE_INVAL;
zinject_record_t record = { 0 };
char pool[MAXNAMELEN];
char dataset[MAXNAMELEN];
zfs_handle_t *zhp;
+ int nowrites = 0;
+ int dur_txg = 0;
+ int dur_secs = 0;
int ret;
int flags = 0;
@@ -517,11 +604,24 @@ main(int argc, char **argv)
return (0);
}
- while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
+ while ((c = getopt(argc, argv,
+ ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
break;
+ case 'A':
+ if (strcasecmp(optarg, "degrade") == 0) {
+ action = VDEV_STATE_DEGRADED;
+ } else if (strcasecmp(optarg, "fault") == 0) {
+ action = VDEV_STATE_FAULTED;
+ } else {
+ (void) fprintf(stderr, "invalid action '%s': "
+ "must be 'degrade' or 'fault'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
case 'b':
raw = optarg;
break;
@@ -538,6 +638,8 @@ main(int argc, char **argv)
error = ECKSUM;
} else if (strcasecmp(optarg, "nxio") == 0) {
error = ENXIO;
+ } else if (strcasecmp(optarg, "dtl") == 0) {
+ error = ECHILD;
} else {
(void) fprintf(stderr, "invalid error type "
"'%s': must be 'io', 'checksum' or "
@@ -557,9 +659,27 @@ main(int argc, char **argv)
case 'F':
record.zi_failfast = B_TRUE;
break;
+ case 'g':
+ dur_txg = 1;
+ record.zi_duration = (int)strtol(optarg, &end, 10);
+ if (record.zi_duration <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid duration '%s': "
+ "must be a positive integer\n", optarg);
+ usage();
+ return (1);
+ }
+ /* store duration of txgs as its negative */
+ record.zi_duration *= -1;
+ break;
case 'h':
usage();
return (0);
+ case 'I':
+ /* default duration, if one hasn't yet been defined */
+ nowrites = 1;
+ if (dur_secs == 0 && dur_txg == 0)
+ record.zi_duration = 30;
+ break;
case 'l':
level = (int)strtol(optarg, &end, 10);
if (*end != '\0') {
@@ -572,12 +692,45 @@ main(int argc, char **argv)
case 'm':
domount = 1;
break;
+ case 'p':
+ (void) strlcpy(record.zi_func, optarg,
+ sizeof (record.zi_func));
+ break;
case 'q':
quiet = 1;
break;
case 'r':
range = optarg;
break;
+ case 's':
+ dur_secs = 1;
+ record.zi_duration = (int)strtol(optarg, &end, 10);
+ if (record.zi_duration <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid duration '%s': "
+ "must be a positive integer\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
+ case 'T':
+ if (strcasecmp(optarg, "read") == 0) {
+ io_type = ZIO_TYPE_READ;
+ } else if (strcasecmp(optarg, "write") == 0) {
+ io_type = ZIO_TYPE_WRITE;
+ } else if (strcasecmp(optarg, "free") == 0) {
+ io_type = ZIO_TYPE_FREE;
+ } else if (strcasecmp(optarg, "claim") == 0) {
+ io_type = ZIO_TYPE_CLAIM;
+ } else if (strcasecmp(optarg, "all") == 0) {
+ io_type = ZIO_TYPES;
+ } else {
+ (void) fprintf(stderr, "invalid I/O type "
+ "'%s': must be 'read', 'write', 'free', "
+ "'claim' or 'all'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
case 't':
if ((type = name_to_type(optarg)) == TYPE_INVAL &&
!MOS_TYPE(type)) {
@@ -620,7 +773,8 @@ main(int argc, char **argv)
* '-c' is invalid with any other options.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0) {
+ level != 0 || record.zi_func[0] != '\0' ||
+ record.zi_duration != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -652,7 +806,8 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0) {
+ level != 0 || record.zi_func[0] != '\0' ||
+ record.zi_duration != 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -675,12 +830,18 @@ main(int argc, char **argv)
return (1);
}
+ record.zi_iotype = io_type;
if (translate_device(pool, device, label, &record) != 0)
return (1);
if (!error)
error = ENXIO;
+
+ if (action != VDEV_STATE_UNKNOWN)
+ return (perform_action(pool, &record, action));
+
} else if (raw != NULL) {
- if (range != NULL || type != TYPE_INVAL || level != 0) {
+ if (range != NULL || type != TYPE_INVAL || level != 0 ||
+ record.zi_func[0] != '\0' || record.zi_duration != 0) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@@ -707,10 +868,52 @@ main(int argc, char **argv)
return (1);
if (!error)
error = EIO;
+ } else if (record.zi_func[0] != '\0') {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || device != NULL || record.zi_duration != 0) {
+ (void) fprintf(stderr, "panic (-p) incompatible with "
+ "other options\n");
+ usage();
+ return (2);
+ }
+
+ if (argc < 1 || argc > 2) {
+ (void) fprintf(stderr, "panic (-p) injection requires "
+ "a single pool name and an optional id\n");
+ usage();
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ if (argv[1] != NULL)
+ record.zi_type = atoi(argv[1]);
+ dataset[0] = '\0';
+ } else if (record.zi_duration != 0) {
+ if (nowrites == 0) {
+ (void) fprintf(stderr, "-s or -g meaningless "
+ "without -I (ignore writes)\n");
+ usage();
+ return (2);
+ } else if (dur_secs && dur_txg) {
+ (void) fprintf(stderr, "choose a duration either "
+ "in seconds (-s) or a number of txgs (-g) "
+ "but not both\n");
+ usage();
+ return (2);
+ } else if (argc != 1) {
+ (void) fprintf(stderr, "ignore writes (-I) "
+ "injection requires a single pool name\n");
+ usage();
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ dataset[0] = '\0';
} else if (type == TYPE_INVAL) {
if (flags == 0) {
(void) fprintf(stderr, "at least one of '-b', '-d', "
- "'-t', '-a', or '-u' must be specified\n");
+ "'-t', '-a', '-p', '-I' or '-u' "
+ "must be specified\n");
usage();
return (2);
}
diff --git a/cddl/contrib/opensolaris/cmd/zinject/zinject.h b/cddl/contrib/opensolaris/cmd/zinject/zinject.h
index adc3efe80400..46fdcad8b31f 100644
--- a/cddl/contrib/opensolaris/cmd/zinject/zinject.h
+++ b/cddl/contrib/opensolaris/cmd/zinject/zinject.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZINJECT_H
#define _ZINJECT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_ioctl.h>
#ifdef __cplusplus
@@ -41,11 +38,13 @@ typedef enum {
TYPE_MOSDIR, /* MOS object directory */
TYPE_METASLAB, /* metaslab objects */
TYPE_CONFIG, /* MOS config */
- TYPE_BPLIST, /* block pointer list */
+ TYPE_BPOBJ, /* block pointer list */
TYPE_SPACEMAP, /* space map objects */
TYPE_ERRLOG, /* persistent error log */
TYPE_LABEL_UBERBLOCK, /* label specific uberblock */
TYPE_LABEL_NVLIST, /* label specific nvlist */
+ TYPE_LABEL_PAD1, /* label specific 8K pad1 area */
+ TYPE_LABEL_PAD2, /* label specific 8K pad2 area */
TYPE_INVAL
} err_type_t;
diff --git a/cddl/contrib/opensolaris/cmd/zlook/zlook.c b/cddl/contrib/opensolaris/cmd/zlook/zlook.c
new file mode 100644
index 000000000000..29a6559f9023
--- /dev/null
+++ b/cddl/contrib/opensolaris/cmd/zlook/zlook.c
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * This is a test program that uses ioctls to the ZFS Unit Test driver
+ * to perform readdirs or lookups using flags not normally available
+ * to user-land programs. This allows testing of the flags'
+ * behavior outside of a complicated consumer, such as the SMB driver.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/dirent.h>
+#include <sys/attr.h>
+#include <stddef.h>
+#include <fcntl.h>
+#include <string.h>
+#include <time.h>
+
+#define _KERNEL
+
+#include <sys/fs/zut.h>
+#include <sys/extdirent.h>
+
+#undef _KERNEL
+
+#define MAXBUF (64 * 1024)
+#define BIGBUF 4096
+#define LILBUF (sizeof (dirent_t))
+
+#define DIRENT_NAMELEN(reclen) \
+ ((reclen) - (offsetof(dirent_t, d_name[0])))
+
+static void
+usage(char *pnam)
+{
+ (void) fprintf(stderr, "Usage:\n %s -l [-is] dir-to-look-in "
+ "file-in-dir [xfile-on-file]\n", pnam);
+ (void) fprintf(stderr, " %s -i [-ls] dir-to-look-in "
+ "file-in-dir [xfile-on-file]\n", pnam);
+ (void) fprintf(stderr, " %s -s [-il] dir-to-look-in "
+ "file-in-dir [xfile-on-file]\n", pnam);
+ (void) fprintf(stderr, "\t Perform a lookup\n");
+ (void) fprintf(stderr, "\t -l == lookup\n");
+ (void) fprintf(stderr, "\t -i == request FIGNORECASE\n");
+ (void) fprintf(stderr, "\t -s == request stat(2) and xvattr info\n");
+ (void) fprintf(stderr, " %s -r [-ea] [-b buffer-size-in-bytes] "
+ "dir-to-look-in [file-in-dir]\n", pnam);
+ (void) fprintf(stderr, " %s -e [-ra] [-b buffer-size-in-bytes] "
+ "dir-to-look-in [file-in-dir]\n", pnam);
+ (void) fprintf(stderr, " %s -a [-re] [-b buffer-size-in-bytes] "
+ "dir-to-look-in [file-in-dir]\n", pnam);
+ (void) fprintf(stderr, "\t Perform a readdir\n");
+ (void) fprintf(stderr, "\t -r == readdir\n");
+ (void) fprintf(stderr, "\t -e == request extended entries\n");
+ (void) fprintf(stderr, "\t -a == request access filtering\n");
+ (void) fprintf(stderr, "\t -b == buffer size (default 4K)\n");
+ (void) fprintf(stderr, " %s -A path\n", pnam);
+ (void) fprintf(stderr, "\t Look up _PC_ACCESS_FILTERING "
+ "for path with pathconf(2)\n");
+ (void) fprintf(stderr, " %s -E path\n", pnam);
+ (void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS "
+ "for path with pathconf(2)\n");
+ (void) fprintf(stderr, " %s -S path\n", pnam);
+ (void) fprintf(stderr, "\t Look up _PC_SATTR_ENABLED "
+ "for path with pathconf(2)\n");
+ exit(EINVAL);
+}
+
+static void
+print_extd_entries(zut_readdir_t *r)
+{
+ struct edirent *eodp;
+ char *bufstart;
+
+ eodp = (edirent_t *)(uintptr_t)r->zr_buf;
+ bufstart = (char *)eodp;
+ while ((char *)eodp < bufstart + r->zr_bytes) {
+ char *blanks = " ";
+ int i = 0;
+ while (i < EDIRENT_NAMELEN(eodp->ed_reclen)) {
+ if (!eodp->ed_name[i])
+ break;
+ (void) printf("%c", eodp->ed_name[i++]);
+ }
+ if (i < 16)
+ (void) printf("%.*s", 16 - i, blanks);
+ (void) printf("\t%x\n", eodp->ed_eflags);
+ eodp = (edirent_t *)((intptr_t)eodp + eodp->ed_reclen);
+ }
+}
+
+static void
+print_entries(zut_readdir_t *r)
+{
+ dirent64_t *dp;
+ char *bufstart;
+
+ dp = (dirent64_t *)(intptr_t)r->zr_buf;
+ bufstart = (char *)dp;
+ while ((char *)dp < bufstart + r->zr_bytes) {
+ int i = 0;
+ while (i < DIRENT_NAMELEN(dp->d_reclen)) {
+ if (!dp->d_name[i])
+ break;
+ (void) printf("%c", dp->d_name[i++]);
+ }
+ (void) printf("\n");
+ dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
+ }
+}
+
+static void
+print_stats(struct stat64 *sb)
+{
+ char timebuf[512];
+
+ (void) printf("st_mode\t\t\t%04lo\n", (unsigned long)sb->st_mode);
+ (void) printf("st_ino\t\t\t%llu\n", (unsigned long long)sb->st_ino);
+ (void) printf("st_nlink\t\t%lu\n", (unsigned long)sb->st_nlink);
+ (void) printf("st_uid\t\t\t%d\n", sb->st_uid);
+ (void) printf("st_gid\t\t\t%d\n", sb->st_gid);
+ (void) printf("st_size\t\t\t%lld\n", (long long)sb->st_size);
+ (void) printf("st_blksize\t\t%ld\n", (long)sb->st_blksize);
+ (void) printf("st_blocks\t\t%lld\n", (long long)sb->st_blocks);
+
+ timebuf[0] = 0;
+ if (ctime_r(&sb->st_atime, timebuf, 512)) {
+ (void) printf("st_atime\t\t");
+ (void) printf("%s", timebuf);
+ }
+ timebuf[0] = 0;
+ if (ctime_r(&sb->st_mtime, timebuf, 512)) {
+ (void) printf("st_mtime\t\t");
+ (void) printf("%s", timebuf);
+ }
+ timebuf[0] = 0;
+ if (ctime_r(&sb->st_ctime, timebuf, 512)) {
+ (void) printf("st_ctime\t\t");
+ (void) printf("%s", timebuf);
+ }
+}
+
+static void
+print_xvs(uint64_t xvs)
+{
+ uint_t bits;
+ int idx = 0;
+
+ if (xvs == 0)
+ return;
+
+ (void) printf("-------------------\n");
+ (void) printf("Attribute bit(s) set:\n");
+ (void) printf("-------------------\n");
+
+ bits = xvs & ((1 << F_ATTR_ALL) - 1);
+ while (bits) {
+ uint_t rest = bits >> 1;
+ if (bits & 1) {
+ (void) printf("%s", attr_to_name((f_attr_t)idx));
+ if (rest)
+ (void) printf(", ");
+ }
+ idx++;
+ bits = rest;
+ }
+ (void) printf("\n");
+}
+
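
print_xvs() above walks the mask one bit at a time, emitting a comma only while higher set bits remain. The same pattern in isolation:

	#include <stdio.h>

	/* Print positions of set bits, comma-separated: 0x15 -> "0, 2, 4". */
	static void
	print_set_bits(unsigned int bits)
	{
		int idx = 0;

		while (bits) {
			unsigned int rest = bits >> 1;
			if (bits & 1)
				(void) printf("%d%s", idx, rest ? ", " : "");
			idx++;
			bits = rest;
		}
		(void) printf("\n");
	}

	int
	main(void)
	{
		print_set_bits(0x15);
		return (0);
	}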
+int
+main(int argc, char **argv)
+{
+ zut_lookup_t lk = {0};
+ zut_readdir_t rd = {0};
+ boolean_t checking = B_FALSE;
+ boolean_t looking = B_FALSE;
+ boolean_t reading = B_FALSE;
+ boolean_t bflag = B_FALSE;
+ long rddir_bufsize = BIGBUF;
+ int error = 0;
+ int check;
+ int fd;
+ int c;
+
+ while ((c = getopt(argc, argv, "lisaerb:ASE")) != -1) {
+ switch (c) {
+ case 'l':
+ looking = B_TRUE;
+ break;
+ case 'i':
+ lk.zl_reqflags |= ZUT_IGNORECASE;
+ looking = B_TRUE;
+ break;
+ case 's':
+ lk.zl_reqflags |= ZUT_GETSTAT;
+ looking = B_TRUE;
+ break;
+ case 'a':
+ rd.zr_reqflags |= ZUT_ACCFILTER;
+ reading = B_TRUE;
+ break;
+ case 'e':
+ rd.zr_reqflags |= ZUT_EXTRDDIR;
+ reading = B_TRUE;
+ break;
+ case 'r':
+ reading = B_TRUE;
+ break;
+ case 'b':
+ reading = B_TRUE;
+ bflag = B_TRUE;
+ rddir_bufsize = strtol(optarg, NULL, 0);
+ break;
+ case 'A':
+ checking = B_TRUE;
+ check = _PC_ACCESS_FILTERING;
+ break;
+ case 'S':
+ checking = B_TRUE;
+ check = _PC_SATTR_ENABLED;
+ break;
+ case 'E':
+ checking = B_TRUE;
+ check = _PC_SATTR_EXISTS;
+ break;
+ case '?':
+ default:
+ usage(argv[0]); /* no return */
+ }
+ }
+
+ if ((checking && looking) || (checking && reading) ||
+ (looking && reading) || (!reading && bflag) ||
+ (!checking && !reading && !looking))
+ usage(argv[0]); /* no return */
+
+ if (rddir_bufsize < LILBUF || rddir_bufsize > MAXBUF) {
+ (void) fprintf(stderr, "Sorry, buffer size "
+ "must be at least %d and at most %d bytes.\n",
+ (int)LILBUF, MAXBUF);
+ exit(EINVAL);
+ }
+
+ if (checking) {
+ char pathbuf[MAXPATHLEN];
+ long result;
+
+ if (argc - optind < 1)
+ usage(argv[0]); /* no return */
+ (void) strlcpy(pathbuf, argv[optind], MAXPATHLEN);
+ result = pathconf(pathbuf, check);
+ (void) printf("pathconf(2) check for %s\n", pathbuf);
+ switch (check) {
+ case _PC_SATTR_ENABLED:
+ (void) printf("System attributes ");
+ if (result != 0)
+ (void) printf("Enabled\n");
+ else
+ (void) printf("Not enabled\n");
+ break;
+ case _PC_SATTR_EXISTS:
+ (void) printf("System attributes ");
+ if (result != 0)
+ (void) printf("Exist\n");
+ else
+ (void) printf("Do not exist\n");
+ break;
+ case _PC_ACCESS_FILTERING:
+ (void) printf("Access filtering ");
+ if (result != 0)
+ (void) printf("Available\n");
+ else
+ (void) printf("Not available\n");
+ break;
+ }
+ return (result);
+ }
+
+ if ((fd = open(ZUT_DEV, O_RDONLY)) < 0) {
+ perror(ZUT_DEV);
+ return (ENXIO);
+ }
+
+ if (reading) {
+ char *buf;
+
+ if (argc - optind < 1)
+ usage(argv[0]); /* no return */
+
+ (void) strlcpy(rd.zr_dir, argv[optind], MAXPATHLEN);
+ if (argc - optind > 1) {
+ (void) strlcpy(rd.zr_file, argv[optind + 1],
+ MAXNAMELEN);
+ rd.zr_reqflags |= ZUT_XATTR;
+ }
+
+ if ((buf = malloc(rddir_bufsize)) == NULL) {
+ error = errno;
+ perror("malloc");
+ (void) close(fd);
+ return (error);
+ }
+
+ rd.zr_buf = (uint64_t)(uintptr_t)buf;
+ rd.zr_buflen = rddir_bufsize;
+
+ while (!rd.zr_eof) {
+ int ierr;
+
+ if ((ierr = ioctl(fd, ZUT_IOC_READDIR, &rd)) != 0) {
+ (void) fprintf(stderr,
+ "IOCTL error: %s (%d)\n",
+ strerror(ierr), ierr);
+ free(buf);
+ (void) close(fd);
+ return (ierr);
+ }
+ if (rd.zr_retcode) {
+ (void) fprintf(stderr,
+ "readdir result: %s (%d)\n",
+ strerror(rd.zr_retcode), rd.zr_retcode);
+ free(buf);
+ (void) close(fd);
+ return (rd.zr_retcode);
+ }
+ if (rd.zr_reqflags & ZUT_EXTRDDIR)
+ print_extd_entries(&rd);
+ else
+ print_entries(&rd);
+ }
+ free(buf);
+ } else {
+ int ierr;
+
+ if (argc - optind < 2)
+ usage(argv[0]); /* no return */
+
+ (void) strlcpy(lk.zl_dir, argv[optind], MAXPATHLEN);
+ (void) strlcpy(lk.zl_file, argv[optind + 1], MAXNAMELEN);
+ if (argc - optind > 2) {
+ (void) strlcpy(lk.zl_xfile,
+ argv[optind + 2], MAXNAMELEN);
+ lk.zl_reqflags |= ZUT_XATTR;
+ }
+
+ if ((ierr = ioctl(fd, ZUT_IOC_LOOKUP, &lk)) != 0) {
+ (void) fprintf(stderr,
+ "IOCTL error: %s (%d)\n",
+ strerror(ierr), ierr);
+ (void) close(fd);
+ return (ierr);
+ }
+
+ (void) printf("\nLookup of ");
+ if (lk.zl_reqflags & ZUT_XATTR) {
+ (void) printf("extended attribute \"%s\" of ",
+ lk.zl_xfile);
+ }
+ (void) printf("file \"%s\" ", lk.zl_file);
+ (void) printf("in directory \"%s\" ", lk.zl_dir);
+ if (lk.zl_retcode) {
+ (void) printf("failed: %s (%d)\n",
+ strerror(lk.zl_retcode), lk.zl_retcode);
+ (void) close(fd);
+ return (lk.zl_retcode);
+ }
+
+ (void) printf("succeeded.\n");
+ if (lk.zl_reqflags & ZUT_IGNORECASE) {
+ (void) printf("----------------------------\n");
+ (void) printf("dirent flags: 0x%0x\n", lk.zl_deflags);
+ (void) printf("real name: %s\n", lk.zl_real);
+ }
+ if (lk.zl_reqflags & ZUT_GETSTAT) {
+ (void) printf("----------------------------\n");
+ print_stats(&lk.zl_statbuf);
+ print_xvs(lk.zl_xvattrs);
+ }
+ }
+
+ (void) close(fd);
+ return (0);
+}
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 b/cddl/contrib/opensolaris/cmd/zpool/zpool.8
index b6c97c1a5eff..ff71dff16c32 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool.8
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool.8
@@ -1,9 +1,9 @@
'\" te
.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
-.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
-.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
-.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH zpool 1M "5 Mar 2009" "SunOS 5.11" "System Administration Commands"
+.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
+.TH zpool 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands"
.SH NAME
zpool \- configures ZFS storage pools
.SH SYNOPSIS
@@ -14,125 +14,125 @@ zpool \- configures ZFS storage pools
.LP
.nf
-\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR]
- ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...
+\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...
.fi
.LP
.nf
-\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR
+\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
.fi
.LP
.nf
-\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...
+\fBzpool clear\fR \fIpool\fR [\fIdevice\fR]
.fi
.LP
.nf
-\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...
+\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR]
+ ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...
.fi
.LP
.nf
-\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ...
+\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR
.fi
.LP
.nf
-\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]
+\fBzpool detach\fR \fIpool\fR \fIdevice\fR
.fi
.LP
.nf
-\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...
+\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...
.fi
.LP
.nf
-\fBzpool online\fR \fIpool\fR \fIdevice\fR ...
+\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...
.fi
.LP
.nf
-\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...
+\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...
.fi
.LP
.nf
-\fBzpool clear\fR \fIpool\fR [\fIdevice\fR]
+\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]
.fi
.LP
.nf
-\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
+\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR]
+ [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fB-a\fR
.fi
.LP
.nf
-\fBzpool detach\fR \fIpool\fR \fIdevice\fR
+\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR]
+ [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR |\fIid\fR [\fInewpool\fR]
.fi
.LP
.nf
-\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR]
+\fBzpool iostat\fR [\fB-T\fR u | d ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]
.fi
.LP
.nf
-\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...
+\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ...
.fi
.LP
.nf
-\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]
+\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...
.fi
.LP
.nf
-\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-p\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR]
- [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fB-a\fR
+\fBzpool online\fR \fIpool\fR \fIdevice\fR ...
.fi
.LP
.nf
-\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR]
- [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR |\fIid\fR [\fInewpool\fR]
+\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...
.fi
.LP
.nf
-\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...
+\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR]
.fi
.LP
.nf
-\fBzpool upgrade\fR
+\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...
.fi
.LP
.nf
-\fBzpool upgrade\fR \fB-v\fR
+\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR
.fi
.LP
.nf
-\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...
+\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...
.fi
.LP
.nf
-\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...
+\fBzpool upgrade\fR
.fi
.LP
.nf
-\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...
+\fBzpool upgrade\fR \fB-v\fR
.fi
.LP
.nf
-\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR
+\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...
.fi
.SH DESCRIPTION
@@ -141,8 +141,8 @@ zpool \- configures ZFS storage pools
The \fBzpool\fR command configures \fBZFS\fR storage pools. A storage pool is a collection of devices that provides physical storage and data replication for \fBZFS\fR datasets.
.sp
.LP
-All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for information on managing datasets.
-.SS "Virtual Devices (vdevs)"
+All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for information on managing datasets.
+.SS "Virtual Devices (\fBvdev\fRs)"
.sp
.LP
A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported:
@@ -150,18 +150,18 @@ A "virtual device" describes a single device or a collection of devices organize
.ne 2
.mk
.na
-\fBdisk\fR
+\fB\fBdisk\fR\fR
.ad
.RS 10n
.rt
-A block device, typically located under "/dev/dsk". \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name (the relative portion of the path under "/dev/dsk"). A whole disk can be specified by omitting the slice or partition designation. For example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole disk, \fBZFS\fR automatically labels the disk, if necessary.
+A block device, typically located under \fB/dev/dsk\fR. \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name (the relative portion of the path under "/dev/dsk"). A whole disk can be specified by omitting the slice or partition designation. For example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole disk, \fBZFS\fR automatically labels the disk, if necessary.
.RE
.sp
.ne 2
.mk
.na
-\fBfile\fR
+\fB\fBfile\fR\fR
.ad
.RS 10n
.rt
@@ -172,7 +172,7 @@ A regular file. The use of files as a backing store is strongly discouraged. It
.ne 2
.mk
.na
-\fBmirror\fR
+\fB\fBmirror\fR\fR
.ad
.RS 10n
.rt
@@ -183,21 +183,25 @@ A mirror of two or more devices. Data is replicated in an identical fashion acro
.ne 2
.mk
.na
-\fBraidz\fR
+\fB\fBraidz\fR\fR
+.ad
+.br
+.na
+\fB\fBraidz1\fR\fR
.ad
.br
.na
-\fBraidz1\fR
+\fB\fBraidz2\fR\fR
.ad
.br
.na
-\fBraidz2\fR
+\fB\fBraidz3\fR\fR
.ad
.RS 10n
.rt
A variation on \fBRAID-5\fR that allows for better distribution of parity and eliminates the "\fBRAID-5\fR write hole" (in which data and parity become inconsistent after a power loss). Data and parity are striped across all disks within a \fBraidz\fR group.
.sp
-A \fBraidz\fR group can have either single- or double-parity, meaning that the \fBraidz\fR group can sustain one or two failures respectively without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a single-parity \fBraidz\fR group and the \fBraidz2\fR \fBvdev\fR type specifies a double-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias for \fBraidz1\fR.
+A \fBraidz\fR group can have single-, double-, or triple-parity, meaning that the \fBraidz\fR group can sustain one, two, or three failures, respectively, without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a single-parity \fBraidz\fR group; the \fBraidz2\fR \fBvdev\fR type specifies a double-parity \fBraidz\fR group; and the \fBraidz3\fR \fBvdev\fR type specifies a triple-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias for \fBraidz1\fR.
.sp
A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity disks can hold approximately (\fIN-P\fR)*\fIX\fR bytes and can withstand \fIP\fR device(s) failing before data integrity is compromised. The minimum number of devices in a \fBraidz\fR group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance.
.RE
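
To make the (\fIN-P\fR)*\fIX\fR estimate concrete: a \fBraidz2\fR group of six 1 TB disks has N=6 and P=2, so it holds roughly (6-2)*1 TB = 4 TB and survives any two concurrent disk failures. The same arithmetic as a one-line sketch (illustrative helper, not part of the tools):

	/* Illustrative: approximate usable capacity of a raidz group. */
	static unsigned long long
	raidz_usable_bytes(unsigned n, unsigned parity, unsigned long long x)
	{
		return ((unsigned long long)(n - parity) * x);
	}
	/* raidz_usable_bytes(6, 2, 1ULL << 40) == 4 TiB for the case above */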
@@ -206,7 +210,7 @@ A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity disks
.ne 2
.mk
.na
-\fBspare\fR
+\fB\fBspare\fR\fR
.ad
.RS 10n
.rt
@@ -217,22 +221,22 @@ A special pseudo-\fBvdev\fR which keeps track of available hot spares for a pool
.ne 2
.mk
.na
-\fBlog\fR
+\fB\fBlog\fR\fR
.ad
.RS 10n
.rt
-A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, \fBraidz\fR and \fBraidz2\fR are not supported for the intent log. For more information, see the "Intent Log" section.
+A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, \fBraidz\fR \fBvdev\fR types are not supported for the intent log. For more information, see the "Intent Log" section.
.RE
.sp
.ne 2
.mk
.na
-\fBcache\fR
+\fB\fBcache\fR\fR
.ad
.RS 10n
.rt
-A device used to cache storage pool data. A cache device cannot be mirrored or part of a \fBraidz\fR or \fBraidz2\fR configuration. For more information, see the "Cache Devices" section.
+A device used to cache storage pool data. A cache device cannot be configured as a mirror or \fBraidz\fR group. For more information, see the "Cache Devices" section.
.RE
.sp
@@ -247,7 +251,7 @@ Virtual devices are specified one at a time on the command line, separated by wh
.sp
.in +2
.nf
-\fB# zpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR
+# \fBzpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR
.fi
.in -2
.sp
@@ -403,7 +407,7 @@ The \fBZFS\fR Intent Log (\fBZIL\fR) satisfies \fBPOSIX\fR requirements for sync
Multiple log devices can also be specified, and they can be mirrored. See the EXAMPLES section for an example of mirroring multiple log devices.
.sp
.LP
-Log devices can be added, replaced, attached, detached, and imported and exported as part of the larger pool.
+Log devices can be added, replaced, attached, detached, and imported and exported as part of the larger pool. Mirrored log devices can be removed by specifying the top-level mirror for the log.
.SS "Cache Devices"
.sp
.LP
@@ -433,7 +437,7 @@ Each pool has several properties associated with it. Some properties are read-on
.ne 2
.mk
.na
-\fBavailable\fR
+\fB\fBavailable\fR\fR
.ad
.RS 20n
.rt
@@ -444,7 +448,7 @@ Amount of storage available within the pool. This property can also be referred
.ne 2
.mk
.na
-\fBcapacity\fR
+\fB\fBcapacity\fR\fR
.ad
.RS 20n
.rt
@@ -455,7 +459,7 @@ Percentage of pool space used. This property can also be referred to by its shor
.ne 2
.mk
.na
-\fBhealth\fR
+\fB\fBhealth\fR\fR
.ad
.RS 20n
.rt
@@ -466,7 +470,7 @@ The current health of the pool. Health can be "\fBONLINE\fR", "\fBDEGRADED\fR",
.ne 2
.mk
.na
-\fBguid\fR
+\fB\fBguid\fR\fR
.ad
.RS 20n
.rt
@@ -477,7 +481,7 @@ A unique identifier for the pool.
.ne 2
.mk
.na
-\fBsize\fR
+\fB\fBsize\fR\fR
.ad
.RS 20n
.rt
@@ -488,7 +492,7 @@ Total size of the storage pool.
.ne 2
.mk
.na
-\fBused\fR
+\fB\fBused\fR\fR
.ad
.RS 20n
.rt
@@ -514,12 +518,23 @@ Alternate root directory. If set, this directory is prepended to any mount point
.sp
.LP
-The following properties can be set at creation time and import time, and later changed with the "\fBzpool set\fR" command:
+The following properties can be set at creation time and import time, and later changed with the \fBzpool set\fR command:
.sp
.ne 2
.mk
.na
-\fB\fBautoreplace\fR=on | off\fR
+\fB\fBautoexpand\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls automatic pool expansion when the underlying LUN is grown. If set to \fBon\fR, the pool will be resized according to the size of the expanded device. If the device is part of a mirror or \fBraidz\fR, then all devices within that mirror/\fBraidz\fR group must be expanded before the new space is made available to the pool. The default behavior is \fBoff\fR. This property can also be referred to by its shortened column name, \fBexpand\fR.
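+.sp
+For example, the following command enables automatic expansion on a hypothetical pool named \fItank\fR:
+.sp
+.in +2
+.nf
+# \fBzpool set autoexpand=on tank\fR
+.fi
+.in -2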
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBautoreplace\fR=\fBon\fR | \fBoff\fR\fR
.ad
.sp .6
.RS 4n
@@ -541,7 +556,7 @@ Identifies the default bootable dataset for the root pool. This property is expe
.ne 2
.mk
.na
-\fB\fBcachefile\fR=\fIpath\fR | "none"\fR
+\fB\fBcachefile\fR=\fIpath\fR | \fBnone\fR\fR
.ad
.sp .6
.RS 4n
@@ -574,7 +589,7 @@ Controls the system behavior in the event of catastrophic pool failure. This con
.ne 2
.mk
.na
-\fBwait\fR
+\fB\fBwait\fR\fR
.ad
.RS 12n
.rt
@@ -585,7 +600,7 @@ Blocks all \fBI/O\fR access until the device connectivity is recovered and the e
.ne 2
.mk
.na
-\fBcontinue\fR
+\fB\fBcontinue\fR\fR
.ad
.RS 12n
.rt
@@ -596,7 +611,7 @@ Returns \fBEIO\fR to any new write \fBI/O\fR requests but allows reads to any of
.ne 2
.mk
.na
-\fBpanic\fR
+\fB\fBpanic\fR\fR
.ad
.RS 12n
.rt
@@ -613,7 +628,7 @@ Prints out a message to the console and generates a system crash dump.
.ad
.sp .6
.RS 4n
-Controls whether information about snapshots associated with this pool is output when "\fBzfs list\fR" is run without the \fB-t\fR option. The default value is "off".
+Controls whether information about snapshots associated with this pool is output when \fBzfs list\fR is run without the \fB-t\fR option. The default value is \fBoff\fR.
.RE
.sp
@@ -649,25 +664,19 @@ Displays a help message.
.ne 2
.mk
.na
-\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR
+\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR
.ad
.sp .6
.RS 4n
-Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is described in the "Virtual Devices" section.
-.sp
-The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevents a device from ever being used by \fBZFS\fR. Other uses, such as having a preexisting \fBUFS\fR file system, can be overridden with the \fB-f\fR option.
-.sp
-The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless \fB-f\fR is specified. The use of differently sized devices within a single \fBraidz\fR or mirror group is also flagged as an error unless \fB-f\fR is specified.
-.sp
-Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
+Adds the specified virtual devices to the given pool. The \fIvdev\fR specification is described in the "Virtual Devices" section. The behavior of the \fB-f\fR option and the device checks performed are described in the "zpool create" subcommand.
.sp
.ne 2
.mk
.na
\fB\fB-f\fR\fR
.ad
-.sp .6
-.RS 4n
+.RS 6n
+.rt
Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
.RE
@@ -677,57 +686,32 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r
.na
\fB\fB-n\fR\fR
.ad
-.sp .6
-.RS 4n
-Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR
-.ad
-.sp .6
-.RS 4n
-Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set.
+.RS 6n
+.rt
+Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing.
.RE
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-O\fR \fIfile-system-property=value\fR\fR
-.ad
-.br
-.na
-\fB[\fB-O\fR \fIfile-system-property=value\fR] ...\fR
-.ad
-.sp .6
-.RS 4n
-Sets the given file system properties in the root file system of the pool. See the "Properties" section of \fBzfs\fR(1M) for a list of valid properties that can be set.
+Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device.
.RE
.sp
.ne 2
.mk
.na
-\fB\fB-R\fR \fIroot\fR\fR
+\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
.ad
.sp .6
.RS 4n
-Equivalent to "-o cachefile=none,altroot=\fIroot\fR"
-.RE
-
+Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not currently part of a mirrored configuration, \fIdevice\fR automatically transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If \fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately.
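+.sp
+For example, the following command (using hypothetical device names) attaches \fBc1t1d0\fR to the existing device \fBc1t0d0\fR in the pool \fItank\fR, creating a two-way mirror:
+.sp
+.in +2
+.nf
+# \fBzpool attach tank c1t0d0 c1t1d0\fR
+.fi
+.in -2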
.sp
.ne 2
.mk
.na
-\fB\fB-m\fR \fImountpoint\fR\fR
+\fB\fB-f\fR\fR
.ad
-.sp .6
-.RS 4n
-Sets the mount point for the root dataset. The default mount point is "/\fIpool\fR" or "\fBaltroot\fR/\fIpool\fR" if \fBaltroot\fR is specified. The mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more information on dataset mount points, see \fBzfs\fR(1M).
+.RS 6n
+.rt
+Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
.RE
.RE
@@ -736,41 +720,36 @@ Sets the mount point for the root dataset. The default mount point is "/\fIpool\
.ne 2
.mk
.na
-\fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR
+\fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR
.ad
.sp .6
.RS 4n
-Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool.
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-f\fR\fR
-.ad
-.RS 6n
-.rt
-Forces any active datasets contained within the pool to be unmounted.
-.RE
-
+Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared.
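+.sp
+For example, the following commands clear all errors in a hypothetical pool \fItank\fR, or only those on the device \fBc0t0d0\fR:
+.sp
+.in +2
+.nf
+# \fBzpool clear tank\fR
+# \fBzpool clear tank c0t0d0\fR
+.fi
+.in -2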
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR
+\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR
.ad
.sp .6
.RS 4n
-Adds the specified virtual devices to the given pool. The \fIvdev\fR specification is described in the "Virtual Devices" section. The behavior of the \fB-f\fR option, and the device checks performed are described in the "zpool create" subcommand.
+Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is described in the "Virtual Devices" section.
+.sp
+The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevent a device from ever being used by \fBZFS\fR. Other uses, such as having a preexisting \fBUFS\fR file system, can be overridden with the \fB-f\fR option.
+.sp
+The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless \fB-f\fR is specified. The use of differently sized devices within a single \fBraidz\fR or mirror group is also flagged as an error unless \fB-f\fR is specified.
+.sp
+Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
.sp
.ne 2
.mk
.na
\fB\fB-f\fR\fR
.ad
-.RS 6n
-.rt
+.sp .6
+.RS 4n
Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
.RE
@@ -780,148 +759,79 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r
.na
\fB\fB-n\fR\fR
.ad
-.RS 6n
-.rt
-Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing.
-.RE
-
-Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...\fR
-.ad
.sp .6
.RS 4n
-Removes the specified device from the pool. This command currently only supports removing hot spares and cache devices. Devices that are part of a mirrored configuration can be removed using the "\fBzpool detach\fR" command. Non-redundant and \fBraidz\fR devices cannot be removed from a pool.
+Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ...\fR
+\fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR
.ad
.sp .6
.RS 4n
-Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed.
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-H\fR\fR
-.ad
-.RS 12n
-.rt
-Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
+Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set.
.RE
.sp
.ne 2
.mk
.na
-\fB\fB-o\fR \fIprops\fR\fR
+\fB\fB-O\fR \fIfile-system-property=value\fR\fR
.ad
-.RS 12n
-.rt
-Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, capacity, health, altroot"
-.RE
-
-.RE
-
-.sp
-.ne 2
-.mk
+.br
.na
-\fB\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR
+\fB[\fB-O\fR \fIfile-system-property=value\fR] ...\fR
.ad
.sp .6
.RS 4n
-Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for every pool in the system is shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-v\fR\fR
-.ad
-.RS 6n
-.rt
-Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics.
-.RE
-
+Sets the given file system properties in the root file system of the pool. See the "Properties" section of \fBzfs\fR(1M) for a list of valid properties that can be set.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR
+\fB\fB-R\fR \fIroot\fR\fR
.ad
.sp .6
.RS 4n
-Displays the detailed health status for the given pools. If no \fIpool\fR is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the "Device Failure and Recovery" section.
-.sp
-If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change.
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-x\fR\fR
-.ad
-.RS 6n
-.rt
-Only display status for pools that are exhibiting errors or are otherwise unavailable.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-v\fR\fR
-.ad
-.RS 6n
-.rt
-Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub.
-.RE
-
+Equivalent to "-o cachefile=none,altroot=\fIroot\fR".
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool online\fR \fIpool\fR \fIdevice\fR ...\fR
+\fB\fB-m\fR \fImountpoint\fR\fR
.ad
.sp .6
.RS 4n
-Brings the specified physical device online.
-.sp
-This command is not applicable to spares or cache devices.
+Sets the mount point for the root dataset. The default mount point is "/\fIpool\fR" or "\fBaltroot\fR/\fIpool\fR" if \fBaltroot\fR is specified. The mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more information on dataset mount points, see \fBzfs\fR(1M).
+.RE
+
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR
+\fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR
.ad
.sp .6
.RS 4n
-Takes the specified physical device offline. While the \fIdevice\fR is offline, no attempt is made to read or write to the device.
-.sp
-This command is not applicable to spares or cache devices.
+Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool.
.sp
.ne 2
.mk
.na
-\fB\fB-t\fR\fR
+\fB\fB-f\fR\fR
.ad
.RS 6n
.rt
-Temporary. Upon reboot, the specified physical device reverts to its previous state.
+Forces any active datasets contained within the pool to be unmounted.
.RE
.RE
@@ -930,22 +840,26 @@ Temporary. Upon reboot, the specified physical device reverts to its previous st
.ne 2
.mk
.na
-\fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR
+\fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR
.ad
.sp .6
.RS 4n
-Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared.
+Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
+\fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR
.ad
.sp .6
.RS 4n
-Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not currently part of a mirrored configuration, \fIdevice\fR automatically transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If \fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately.
+Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present.
+.sp
+Before exporting the pool, all datasets within the pool are unmounted. A pool cannot be exported if it has a shared spare that is currently being used.
+.sp
+For pools to be portable, you must give the \fBzpool\fR command whole disks, not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks.
.sp
.ne 2
.mk
@@ -954,7 +868,9 @@ Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device
.ad
.RS 6n
.rt
-Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner.
+Forcefully unmount all datasets, using the "\fBunmount -f\fR" command.
+.sp
+This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to data corruption.
.RE
.RE
@@ -963,61 +879,54 @@ Forces use of \fInew_device\fR, even if its appears to be in use. Not all device
.ne 2
.mk
.na
-\fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR
+\fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR
.ad
.sp .6
.RS 4n
-Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data.
+Retrieves the given list of properties (or all properties if "\fBall\fR" is used) for the specified storage pool(s). These properties are displayed with the following fields:
+.sp
+.in +2
+.nf
+ name Name of storage pool
+ property Property name
+ value Property value
+ source Property source, either 'default' or 'local'.
+.fi
+.in -2
+.sp
+
+See the "Properties" section for more information on the available pool properties.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR
+\fB\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...\fR
.ad
.sp .6
.RS 4n
-Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to attaching \fInew_device\fR, waiting for it to resilver, and then detaching \fIold_device\fR.
-.sp
-The size of \fInew_device\fR must be greater than or equal to the minimum size of all the devices in a mirror or \fBraidz\fR configuration.
-.sp
-\fInew_device\fR is required if the pool is not redundant. If \fInew_device\fR is not specified, it defaults to \fIold_device\fR. This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same \fB/dev/dsk\fR path as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this.
+Displays the command history of the specified pools or all pools if no pool is specified.
.sp
.ne 2
.mk
.na
-\fB\fB-f\fR\fR
+\fB\fB-i\fR\fR
.ad
.RS 6n
.rt
-Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner.
-.RE
-
+Displays internally logged \fBZFS\fR events in addition to user initiated events.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR
-.ad
-.sp .6
-.RS 4n
-Begins a scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, \fBZFS\fR automatically repairs any damage discovered during the scrub. The "\fBzpool status\fR" command reports the progress of the scrub and summarizes the results of the scrub upon completion.
-.sp
-Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that \fBZFS\fR knows to be out of date (for example, when attaching a new device to a mirror or replacing an existing device), whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure.
-.sp
-Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR only allows one at a time. If a scrub is already in progress, the "\fBzpool scrub\fR" command terminates it and starts a new scrub. If a resilver is in progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes.
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-s\fR\fR
+\fB\fB-l\fR\fR
.ad
.RS 6n
.rt
-Stop scrubbing.
+Displays log records in long format, which in addition to the standard format includes the user name, the hostname, and the zone in which the operation was performed.
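+.sp
+For example, the following command displays both internally logged events and long-format records for a hypothetical pool \fItank\fR:
+.sp
+.in +2
+.nf
+# \fBzpool history -il tank\fR
+.fi
+.in -2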
.RE
.RE
@@ -1261,26 +1170,66 @@ Sets the "\fBcachefile\fR" property to "\fBnone\fR" and the "\fIaltroot\fR" prop
.ne 2
.mk
.na
-\fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR
+\fB\fBzpool iostat\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR
.ad
.sp .6
.RS 4n
-Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present.
+Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for every pool in the system are shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
.sp
-Before exporting the pool, all datasets within the pool are unmounted. A pool can not be exported if it has a shared spare that is currently being used.
+.ne 2
+.mk
+.na
+\fB\fB-T\fR \fBu\fR | \fBd\fR\fR
+.ad
+.RS 12n
+.rt
+Display a time stamp.
.sp
-For pools to be portable, you must give the \fBzpool\fR command whole disks, not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks.
+Specify \fBu\fR for a printed representation of the internal representation of time. See \fBtime\fR(2). Specify \fBd\fR for standard date format. See \fBdate\fR(1).
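+.sp
+For example, the following command (assuming a pool named \fItank\fR) prints statistics every 5 seconds, each report preceded by a time stamp in standard date format:
+.sp
+.in +2
+.nf
+# \fBzpool iostat -T d tank 5\fR
+.fi
+.in -2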
+.RE
+
.sp
.ne 2
.mk
.na
-\fB\fB-f\fR\fR
+\fB\fB-v\fR\fR
.ad
-.RS 6n
+.RS 12n
.rt
-Forcefully unmount all datasets, using the "\fBunmount -f\fR" command.
+Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics.
+.RE
+
+.RE
+
.sp
-This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption.
+.ne 2
+.mk
+.na
+\fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.RS 12n
+.rt
+Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIprops\fR\fR
+.ad
+.RS 12n
+.rt
+Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, capacity, health, altroot".
.RE
.RE
@@ -1289,53 +1238,109 @@ This command will forcefully export the pool even if it has a shared spare that
.ne 2
.mk
.na
-\fB\fBzpool upgrade\fR\fR
+\fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR
.ad
.sp .6
.RS 4n
-Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with a more recent version are also displayed, although these pools will be inaccessible on the system.
+Takes the specified physical device offline. While the \fIdevice\fR is offline, no attempt is made to read or write to the device.
+.sp
+This command is not applicable to spares or cache devices.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-t\fR\fR
+.ad
+.RS 6n
+.rt
+Temporary. Upon reboot, the specified physical device reverts to its previous state.
+.RE
+
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool upgrade\fR \fB-v\fR\fR
+\fB\fBzpool online\fR [\fB-e\fR] \fIpool\fR \fIdevice\fR...\fR
.ad
.sp .6
.RS 4n
-Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
+Brings the specified physical device online.
+.sp
+This command is not applicable to spares or cache devices.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-e\fR\fR
+.ad
+.RS 6n
+.rt
+Expand the device to use all available space. If the device is part of a mirror or \fBraidz\fR, then all devices must be expanded before the new space will become available to the pool.
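+.sp
+For example, the following command (using a hypothetical device name) brings \fBc0t0d0\fR online in the pool \fItank\fR and expands it to use all available space:
+.sp
+.in +2
+.nf
+# \fBzpool online -e tank c0t0d0\fR
+.fi
+.in -2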
+.RE
+
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...\fR
+\fB\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...\fR
.ad
.sp .6
.RS 4n
-Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
+Removes the specified device from the pool. This command currently only supports removing hot spares, cache, and log devices. A mirrored log device can be removed by specifying the top-level mirror for the log. Non-log devices that are part of a mirrored configuration can be removed using the \fBzpool detach\fR command. Non-redundant and \fBraidz\fR devices cannot be removed from a pool.
+.RE
+
.sp
.ne 2
.mk
.na
-\fB\fB-a\fR\fR
+\fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR
.ad
-.RS 14n
+.sp .6
+.RS 4n
+Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to attaching \fInew_device\fR, waiting for it to resilver, and then detaching \fIold_device\fR.
+.sp
+The size of \fInew_device\fR must be greater than or equal to the minimum size of all the devices in a mirror or \fBraidz\fR configuration.
+.sp
+\fInew_device\fR is required if the pool is not redundant. If \fInew_device\fR is not specified, it defaults to \fIold_device\fR. This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same \fB/dev/dsk\fR path as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
.rt
-Upgrades all pools.
+Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
+.RE
+
.RE
.sp
.ne 2
.mk
.na
-\fB\fB-V\fR \fIversion\fR\fR
+\fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR
.ad
-.RS 14n
+.sp .6
+.RS 4n
+Begins a scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, \fBZFS\fR automatically repairs any damage discovered during the scrub. The "\fBzpool status\fR" command reports the progress of the scrub and summarizes the results of the scrub upon completion.
+.sp
+Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that \fBZFS\fR knows to be out of date (for example, when attaching a new device to a mirror or replacing an existing device), whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure.
+.sp
+Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR only allows one at a time. If a scrub is already in progress, the "\fBzpool scrub\fR" command terminates it and starts a new scrub. If a resilver is in progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes.
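+.sp
+For example, the following commands start a scrub of a hypothetical pool \fItank\fR and later stop it:
+.sp
+.in +2
+.nf
+# \fBzpool scrub tank\fR
+# \fBzpool scrub -s tank\fR
+.fi
+.in -2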
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR\fR
+.ad
+.RS 6n
.rt
-Upgrade to the specified version. If the \fB-V\fR flag is not specified, the pool is upgraded to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software.
+Stop scrubbing.
.RE
.RE
@@ -1344,31 +1349,44 @@ Upgrade to the specified version. If the \fB-V\fR flag is not specified, the poo
.ne 2
.mk
.na
-\fB\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...\fR
+\fB\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR\fR
.ad
.sp .6
.RS 4n
-Displays the command history of the specified pools or all pools if no pool is specified.
+Sets the given property on the specified pool. See the "Properties" section for more information on what properties can be set and acceptable values.
+.RE
+
.sp
.ne 2
.mk
.na
-\fB\fB-i\fR\fR
+\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Displays the detailed health status for the given pools. If no \fIpool\fR is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the "Device Failure and Recovery" section.
+.sp
+If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-x\fR\fR
.ad
.RS 6n
.rt
-Displays internally logged \fBZFS\fR events in addition to user initiated events.
+Only display status for pools that are exhibiting errors or are otherwise unavailable.
.RE
.sp
.ne 2
.mk
.na
-\fB\fB-l\fR\fR
+\fB\fB-v\fR\fR
.ad
.RS 6n
.rt
-Displays log records in long format, which in addition to standard format includes, the user name, the hostname, and the zone in which the operation was performed.
+Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub.
.RE
.RE
@@ -1377,34 +1395,55 @@ Displays log records in long format, which in addition to standard format includ
.ne 2
.mk
.na
-\fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR
+\fB\fBzpool upgrade\fR\fR
.ad
.sp .6
.RS 4n
-Retrieves the given list of properties (or all properties if "\fBall\fR" is used) for the specified storage pool(s). These properties are displayed with the following fields:
-.sp
-.in +2
-.nf
- name Name of storage pool
- property Property name
- value Property value
- source Property source, either 'default' or 'local'.
-.fi
-.in -2
-.sp
+Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with a more recent version are also displayed, although these pools will be inaccessible on the system.
+.RE
-See the "Properties" section for more information on the available pool properties.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool upgrade\fR \fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
.RE
.sp
.ne 2
.mk
.na
-\fB\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR\fR
+\fB\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...\fR
.ad
.sp .6
.RS 4n
-Sets the given property on the specified pool. See the "Properties" section for more information on what properties can be set and acceptable values.
+Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.RS 14n
+.rt
+Upgrades all pools.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-V\fR \fIversion\fR\fR
+.ad
+.RS 14n
+.rt
+Upgrade to the specified version. If the \fB-V\fR flag is not specified, the pool is upgraded to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software.
+.RE
+
.RE
.SH EXAMPLES
@@ -1417,7 +1456,7 @@ The following command creates a pool with a single \fBraidz\fR root \fIvdev\fR t
.sp
.in +2
.nf
-\fB# zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR
+# \fBzpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR
.fi
.in -2
.sp
@@ -1431,7 +1470,7 @@ The following command creates a pool with two mirrors, where each mirror contain
.sp
.in +2
.nf
-\fB# zpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR
+# \fBzpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR
.fi
.in -2
.sp
@@ -1445,7 +1484,7 @@ The following command creates an unmirrored pool using two disk slices.
.sp
.in +2
.nf
-\fB# zpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR
+# \fBzpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR
.fi
.in -2
.sp
@@ -1459,7 +1498,7 @@ The following command creates an unmirrored pool using files. While not recommen
.sp
.in +2
.nf
-\fB# zpool create tank /path/to/file/a /path/to/file/b\fR
+# \fBzpool create tank /path/to/file/a /path/to/file/b\fR
.fi
.in -2
.sp
@@ -1473,7 +1512,7 @@ The following command adds two mirrored disks to the pool "\fItank\fR", assuming
.sp
.in +2
.nf
-\fB# zpool add tank mirror c1t0d0 c1t1d0\fR
+# \fBzpool add tank mirror c1t0d0 c1t1d0\fR
.fi
.in -2
.sp
@@ -1491,7 +1530,7 @@ The results from this command are similar to the following:
.sp
.in +2
.nf
-\fB# zpool list\fR
+# \fBzpool list\fR
NAME SIZE USED AVAIL CAP HEALTH ALTROOT
pool 67.5G 2.92M 67.5G 0% ONLINE -
tank 67.5G 2.92M 67.5G 0% ONLINE -
@@ -1509,7 +1548,7 @@ The following command destroys the pool "\fItank\fR" and any datasets contained
.sp
.in +2
.nf
-\fB# zpool destroy -f tank\fR
+# \fBzpool destroy -f tank\fR
.fi
.in -2
.sp
@@ -1523,7 +1562,7 @@ The following command exports the devices in pool \fItank\fR so that they can be
.sp
.in +2
.nf
-\fB# zpool export tank\fR
+# \fBzpool export tank\fR
.fi
.in -2
.sp
@@ -1541,7 +1580,7 @@ The results from this command are similar to the following:
.sp
.in +2
.nf
-\fB# zpool import\fR
+# \fBzpool import\fR
pool: tank
id: 15451357997522795478
state: ONLINE
@@ -1553,7 +1592,7 @@ config:
c1t2d0 ONLINE
c1t3d0 ONLINE
-\fB# zpool import tank\fR
+# \fBzpool import tank\fR
.fi
.in -2
.sp
@@ -1567,7 +1606,7 @@ The following command upgrades all ZFS Storage pools to the current version of t
.sp
.in +2
.nf
-\fB# zpool upgrade -a\fR
+# \fBzpool upgrade -a\fR
This system is currently running ZFS version 2.
.fi
.in -2
@@ -1582,7 +1621,7 @@ The following command creates a new pool with an available hot spare:
.sp
.in +2
.nf
-\fB# zpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR
+# \fBzpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR
.fi
.in -2
.sp
@@ -1594,7 +1633,7 @@ If one of the disks were to fail, the pool would be reduced to the degraded stat
.sp
.in +2
.nf
-\fB# zpool replace tank c0t0d0 c0t3d0\fR
+# \fBzpool replace tank c0t0d0 c0t3d0\fR
.fi
.in -2
.sp
@@ -1606,7 +1645,7 @@ Once the data has been resilvered, the spare is automatically removed and is mad
.sp
.in +2
.nf
-\fB# zpool remove tank c0t2d0\fR
+# \fBzpool remove tank c0t2d0\fR
.fi
.in -2
.sp
@@ -1620,7 +1659,7 @@ The following command creates a ZFS storage pool consisting of two, two-way mirr
.sp
.in +2
.nf
-\fB# zpool create pool mirror c0d0 c1d0 mirror c2d0 c3d0 log mirror \e
+# \fBzpool create pool mirror c0d0 c1d0 mirror c2d0 c3d0 log mirror \e
c4d0 c5d0\fR
.fi
.in -2
@@ -1635,7 +1674,7 @@ The following command adds two disks for use as cache devices to a ZFS storage p
.sp
.in +2
.nf
-\fB# zpool add pool cache c2d0 c3d0\fR
+# \fBzpool add pool cache c2d0 c3d0\fR
.fi
.in -2
.sp
@@ -1643,10 +1682,57 @@ The following command adds two disks for use as cache devices to a ZFS storage p
.sp
.LP
Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the \fBiostat\fR option as follows:
+
+.sp
+.in +2
+.nf
+# \fBzpool iostat -v pool 5\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 14 \fRRemoving a Mirrored Log Device
+.sp
+.LP
+The following command removes the mirrored log device \fBmirror-2\fR.
+
+.sp
+.LP
+Given this configuration:
+
+.sp
+.in +2
+.nf
+ pool: tank
+ state: ONLINE
+ scrub: none requested
+config:
+
+ NAME STATE READ WRITE CKSUM
+ tank ONLINE 0 0 0
+ mirror-0 ONLINE 0 0 0
+ c6t0d0 ONLINE 0 0 0
+ c6t1d0 ONLINE 0 0 0
+ mirror-1 ONLINE 0 0 0
+ c6t2d0 ONLINE 0 0 0
+ c6t3d0 ONLINE 0 0 0
+ logs
+ mirror-2 ONLINE 0 0 0
+ c4t0d0 ONLINE 0 0 0
+ c4t1d0 ONLINE 0 0 0
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+The command to remove the mirrored log \fBmirror-2\fR is:
+
.sp
.in +2
.nf
-\fB# zpool iostat -v pool 5\fR
+# \fBzpool remove tank mirror-2\fR
.fi
.in -2
.sp
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
index 09cba89d7b9a..73e40ecc9b11 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <solaris.h>
@@ -44,7 +43,6 @@
#include <zone.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
-
#include <sys/stat.h>
#include <libzfs.h>
@@ -52,6 +50,8 @@
#include "zpool_util.h"
#include "zfs_comutil.h"
+#include "statcommon.h"
+
static int zpool_do_create(int, char **);
static int zpool_do_destroy(int, char **);
@@ -69,6 +69,7 @@ static int zpool_do_clear(int, char **);
static int zpool_do_attach(int, char **);
static int zpool_do_detach(int, char **);
static int zpool_do_replace(int, char **);
+static int zpool_do_split(int, char **);
static int zpool_do_scrub(int, char **);
@@ -121,7 +122,8 @@ typedef enum {
HELP_STATUS,
HELP_UPGRADE,
HELP_GET,
- HELP_SET
+ HELP_SET,
+ HELP_SPLIT
} zpool_help_t;
@@ -158,6 +160,7 @@ static zpool_command_t command_table[] = {
{ "attach", zpool_do_attach, HELP_ATTACH },
{ "detach", zpool_do_detach, HELP_DETACH },
{ "replace", zpool_do_replace, HELP_REPLACE },
+ { "split", zpool_do_split, HELP_SPLIT },
{ NULL },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ NULL },
@@ -175,6 +178,8 @@ static zpool_command_t command_table[] = {
zpool_command_t *current_command;
static char history_str[HIS_MAX_RECORD_LEN];
+static uint_t timestamp_fmt = NODATE;
+
static const char *
get_usage(zpool_help_t idx) {
switch (idx) {
@@ -184,7 +189,7 @@ get_usage(zpool_help_t idx) {
return (gettext("\tattach [-f] <pool> <device> "
"<new-device>\n"));
case HELP_CLEAR:
- return (gettext("\tclear <pool> [device]\n"));
+ return (gettext("\tclear [-nF] <pool> [device]\n"));
case HELP_CREATE:
return (gettext("\tcreate [-fn] [-o property=value] ... \n"
"\t [-O file-system-property=value] ... \n"
@@ -199,17 +204,20 @@ get_usage(zpool_help_t idx) {
return (gettext("\thistory [-il] [<pool>] ...\n"));
case HELP_IMPORT:
return (gettext("\timport [-d dir] [-D]\n"
+ "\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
"\timport [-o mntopts] [-o property=value] ... \n"
- "\t [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n"
+ "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+ "[-R root] [-F [-n]] -a\n"
"\timport [-o mntopts] [-o property=value] ... \n"
- "\t [-d dir | -c cachefile] [-D] [-f] [-R root] "
- "<pool | id> [newpool]\n"));
+ "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+ "[-R root] [-F [-n]]\n"
+ "\t <pool | id> [newpool]\n"));
case HELP_IOSTAT:
- return (gettext("\tiostat [-v] [pool] ... [interval "
+ return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
"[count]]\n"));
case HELP_LIST:
return (gettext("\tlist [-H] [-o property[,...]] "
- "[pool] ...\n"));
+ "[-T d|u] [pool] ... [interval [count]]\n"));
case HELP_OFFLINE:
return (gettext("\toffline [-t] <pool> <device> ...\n"));
case HELP_ONLINE:
@@ -222,7 +230,8 @@ get_usage(zpool_help_t idx) {
case HELP_SCRUB:
return (gettext("\tscrub [-s] <pool> ...\n"));
case HELP_STATUS:
- return (gettext("\tstatus [-vx] [pool] ...\n"));
+ return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval "
+ "[count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade\n"
"\tupgrade -v\n"
@@ -232,6 +241,10 @@ get_usage(zpool_help_t idx) {
"<pool> ...\n"));
case HELP_SET:
return (gettext("\tset <property=value> <pool> \n"));
+ case HELP_SPLIT:
+ return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n"
+ "\t [-o property=value] <pool> <newpool> "
+ "[<device> ...]\n"));
}
abort();
@@ -247,12 +260,12 @@ print_prop_cb(int prop, void *cb)
{
FILE *fp = cb;
- (void) fprintf(fp, "\t%-13s ", zpool_prop_to_name(prop));
+ (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop));
if (zpool_prop_readonly(prop))
(void) fprintf(fp, " NO ");
else
- (void) fprintf(fp, " YES ");
+ (void) fprintf(fp, " YES ");
if (zpool_prop_values(prop) == NULL)
(void) fprintf(fp, "-\n");
@@ -299,7 +312,7 @@ usage(boolean_t requested)
(void) fprintf(fp,
gettext("\nthe following properties are supported:\n"));
- (void) fprintf(fp, "\n\t%-13s %s %s\n\n",
+ (void) fprintf(fp, "\n\t%-15s %s %s\n\n",
"PROPERTY", "EDIT", "VALUES");
/* Iterate over all properties */
@@ -341,7 +354,7 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
if ((is_log && !print_logs) || (!is_log && print_logs))
continue;
- vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_vdev_tree(zhp, vname, child[c], indent + 2,
B_FALSE);
free(vname);
@@ -509,11 +522,10 @@ zpool_do_add(int argc, char **argv)
}
/*
- * zpool remove <pool> <vdev> ...
+ * zpool remove <pool> <vdev> ...
*
- * Removes the given vdev from the pool. Currently, this only supports removing
- * spares and cache devices from the pool. Eventually, we'll want to support
- * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs.
+ * Removes the given vdev from the pool. Currently, this supports removing
+ * spares, cache, and log devices from the pool.
*/
int
zpool_do_remove(int argc, char **argv)
@@ -942,7 +954,7 @@ zpool_do_export(int argc, char **argv)
static int
max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
{
- char *name = zpool_vdev_name(g_zfs, zhp, nv);
+ char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE);
nvlist_t **child;
uint_t c, children;
int ret;
@@ -1034,20 +1046,21 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
{
nvlist_t **child;
uint_t c, children;
+ pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
- char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+ char rbuf[6], wbuf[6], cbuf[6];
char *vname;
uint64_t notpresent;
spare_cbdata_t cb;
char *state;
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &c) == 0);
-
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
children = 0;
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
if (isspare) {
/*
@@ -1125,31 +1138,43 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf(gettext("bad intent log"));
break;
+ case VDEV_AUX_EXTERNAL:
+ (void) printf(gettext("external device fault"));
+ break;
+
+ case VDEV_AUX_SPLIT_POOL:
+ (void) printf(gettext("split into new pool"));
+ break;
+
default:
(void) printf(gettext("corrupted data"));
break;
}
- } else if (vs->vs_scrub_repaired != 0 && children == 0) {
- /*
- * Report bytes resilvered/repaired on leaf devices.
- */
- zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
- (void) printf(gettext(" %s %s"), repaired,
- (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilvered" : "repaired");
+ }
+
+ (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &c);
+
+ if (ps && ps->pss_state == DSS_SCANNING &&
+ vs->vs_scan_processed != 0 && children == 0) {
+ (void) printf(gettext(" (%s)"),
+ (ps->pss_func == POOL_SCAN_RESILVER) ?
+ "resilvering" : "repairing");
}
(void) printf("\n");
for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
+ uint64_t islog = B_FALSE, ishole = B_FALSE;
- /* Don't print logs here */
+ /* Don't print logs or holes here */
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (is_log)
+ &islog);
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &ishole);
+ if (islog || ishole)
continue;
- vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
print_status_config(zhp, vname, child[c],
namewidth, depth + 2, isspare);
free(vname);
@@ -1170,10 +1195,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
char *type, *vname;
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_MISSING) == 0)
+ if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(type, VDEV_TYPE_HOLE) == 0)
return;
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
@@ -1222,7 +1248,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
if (is_log)
continue;
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE);
print_import_config(vname, child[c], namewidth, depth + 2);
free(vname);
}
@@ -1231,7 +1257,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
&child, &children) == 0) {
(void) printf(gettext("\tcache\n"));
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
@@ -1241,7 +1267,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
&child, &children) == 0) {
(void) printf(gettext("\tspares\n"));
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
@@ -1276,7 +1302,7 @@ print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
&is_log);
if (!is_log)
continue;
- name = zpool_vdev_name(g_zfs, zhp, child[c]);
+ name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
if (verbose)
print_status_config(zhp, name, child[c], namewidth,
2, B_FALSE);
@@ -1285,6 +1311,7 @@ print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
free(name);
}
}
+
/*
* Display the status for the given pool.
*/
@@ -1311,7 +1338,7 @@ show_import(nvlist_t *config)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
@@ -1378,6 +1405,11 @@ show_import(nvlist_t *config)
"read.\n"));
break;
+ case ZPOOL_STATUS_RESILVERING:
+ (void) printf(gettext("status: One or more devices were being "
+ "resilvered.\n"));
+ break;
+
default:
/*
* No other status can be seen when importing pools.
@@ -1471,13 +1503,12 @@ show_import(nvlist_t *config)
*/
static int
do_import(nvlist_t *config, const char *newname, const char *mntopts,
- int force, nvlist_t *props, boolean_t do_verbatim)
+ nvlist_t *props, int flags)
{
zpool_handle_t *zhp;
char *name;
uint64_t state;
uint64_t version;
- int error = 0;
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&name) == 0);
@@ -1490,7 +1521,8 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
(void) fprintf(stderr, gettext("cannot import '%s': pool "
"is formatted using a newer ZFS version\n"), name);
return (1);
- } else if (state != POOL_STATE_EXPORTED && !force) {
+ } else if (state != POOL_STATE_EXPORTED &&
+ !(flags & ZFS_IMPORT_ANY_HOST)) {
uint64_t hostid;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
@@ -1524,7 +1556,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
}
}
- if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0)
+ if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
return (1);
if (newname != NULL)
@@ -1534,13 +1566,14 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
return (1);
if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+ !(flags & ZFS_IMPORT_ONLY) &&
zpool_enable_datasets(zhp, mntopts, 0) != 0) {
zpool_close(zhp);
return (1);
}
zpool_close(zhp);
- return (error);
+ return (0);
}
/*
@@ -1548,7 +1581,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
* [-d dir | -c cachefile] [-f] -a
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
- * [-d dir | -c cachefile] [-f] <pool | id> [newpool]
+ * [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
*
* -c Read pool information from a cachefile instead of searching
* devices.
@@ -1563,14 +1596,23 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
* the given root. The pool will remain exported when the machine
* is rebooted.
*
- * -f Force import, even if it appears that the pool is active.
- *
- * -F Import even in the presence of faulted vdevs. This is an
+ * -V Import even in the presence of faulted vdevs. This is an
* intentionally undocumented option for testing purposes, and
* treats the pool configuration as complete, leaving any bad
* vdevs in the FAULTED state. In other words, it does verbatim
* import.
*
+ * -f Force import, even if it appears that the pool is active.
+ *
+ * -F Attempt rewind if necessary.
+ *
+ * -n See if rewind would work, but don't actually rewind.
+ *
+ * -N Import the pool but don't mount datasets.
+ *
+ * -T Specify a starting txg to use for import. This option is
+ * intentionally undocumented and is for testing purposes.
+ *
* -a Import all pools found.
*
* -o Set property=value and/or temporary mount options (without '=').
@@ -1584,26 +1626,32 @@ zpool_do_import(int argc, char **argv)
char **searchdirs = NULL;
int nsearch = 0;
int c;
- int err;
+ int err = 0;
nvlist_t *pools = NULL;
boolean_t do_all = B_FALSE;
boolean_t do_destroyed = B_FALSE;
char *mntopts = NULL;
- boolean_t do_force = B_FALSE;
nvpair_t *elem;
nvlist_t *config;
uint64_t searchguid = 0;
char *searchname = NULL;
char *propval;
nvlist_t *found_config;
+ nvlist_t *policy = NULL;
nvlist_t *props = NULL;
boolean_t first;
- boolean_t do_verbatim = B_FALSE;
- uint64_t pool_state;
+ int flags = ZFS_IMPORT_NORMAL;
+ uint32_t rewind_policy = ZPOOL_NO_REWIND;
+ boolean_t dryrun = B_FALSE;
+ boolean_t do_rewind = B_FALSE;
+ boolean_t xtreme_rewind = B_FALSE;
+ uint64_t pool_state, txg = -1ULL;
char *cachefile = NULL;
+ importargs_t idata = { 0 };
+ char *endptr;
/* check options */
- while ((c = getopt(argc, argv, ":ac:d:DfFo:p:R:")) != -1) {
+ while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) {
switch (c) {
case 'a':
do_all = B_TRUE;
@@ -1628,10 +1676,19 @@ zpool_do_import(int argc, char **argv)
do_destroyed = B_TRUE;
break;
case 'f':
- do_force = B_TRUE;
+ flags |= ZFS_IMPORT_ANY_HOST;
break;
case 'F':
- do_verbatim = B_TRUE;
+ do_rewind = B_TRUE;
+ break;
+ case 'm':
+ flags |= ZFS_IMPORT_MISSING_LOG;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'N':
+ flags |= ZFS_IMPORT_ONLY;
break;
case 'o':
if ((propval = strchr(optarg, '=')) != NULL) {
@@ -1656,6 +1713,22 @@ zpool_do_import(int argc, char **argv)
ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
goto error;
break;
+ case 'T':
+ errno = 0;
+ txg = strtoull(optarg, &endptr, 10);
+ if (errno != 0 || *endptr != '\0') {
+ (void) fprintf(stderr,
+ gettext("invalid txg value\n"));
+ usage(B_FALSE);
+ }
+ rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
+ break;
+ case 'V':
+ flags |= ZFS_IMPORT_VERBATIM;
+ break;
+ case 'X':
+ xtreme_rewind = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@@ -1676,6 +1749,24 @@ zpool_do_import(int argc, char **argv)
usage(B_FALSE);
}
+ if ((dryrun || xtreme_rewind) && !do_rewind) {
+ (void) fprintf(stderr,
+ gettext("-n or -X only meaningful with -F\n"));
+ usage(B_FALSE);
+ }
+ if (dryrun)
+ rewind_policy = ZPOOL_TRY_REWIND;
+ else if (do_rewind)
+ rewind_policy = ZPOOL_DO_REWIND;
+ if (xtreme_rewind)
+ rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+ /* In the future, we can capture further policy and include it here */
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+ nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
+ goto error;
+
if (searchdirs == NULL) {
searchdirs = safe_malloc(sizeof (char *));
searchdirs[0] = "/dev/dsk";
@@ -1703,6 +1794,7 @@ zpool_do_import(int argc, char **argv)
(void) fprintf(stderr, gettext("cannot "
"discover pools: permission denied\n"));
free(searchdirs);
+ nvlist_free(policy);
return (1);
}
}
@@ -1728,28 +1820,49 @@ zpool_do_import(int argc, char **argv)
if (errno != 0 || *endptr != '\0')
searchname = argv[0];
found_config = NULL;
- }
- if (cachefile) {
- pools = zpool_find_import_cached(g_zfs, cachefile, searchname,
- searchguid);
- } else if (searchname != NULL) {
- pools = zpool_find_import_byname(g_zfs, nsearch, searchdirs,
- searchname);
- } else {
/*
- * It's OK to search by guid even if searchguid is 0.
+ * User specified a name or guid. Ensure it's unique.
*/
- pools = zpool_find_import_byguid(g_zfs, nsearch, searchdirs,
- searchguid);
- }
-
- if (pools == NULL) {
+ idata.unique = B_TRUE;
+ }
+
+
+ idata.path = searchdirs;
+ idata.paths = nsearch;
+ idata.poolname = searchname;
+ idata.guid = searchguid;
+ idata.cachefile = cachefile;
+
+ pools = zpool_search_import(g_zfs, &idata);
+
+ if (pools != NULL && idata.exists &&
+ (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "a pool with that name already exists\n"),
+ argv[0]);
+ (void) fprintf(stderr, gettext("use the form '%s "
+ "<pool | id> <newpool>' to give it a new name\n"),
+ "zpool import");
+ err = 1;
+ } else if (pools == NULL && idata.exists) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "a pool with that name is already created/imported,\n"),
+ argv[0]);
+ (void) fprintf(stderr, gettext("and no additional pools "
+ "with that name were found\n"));
+ err = 1;
+ } else if (pools == NULL) {
if (argc != 0) {
(void) fprintf(stderr, gettext("cannot import '%s': "
"no such pool available\n"), argv[0]);
}
+ err = 1;
+ }
+
+ if (err == 1) {
free(searchdirs);
+ nvlist_free(policy);
return (1);
}
@@ -1773,17 +1886,21 @@ zpool_do_import(int argc, char **argv)
if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
continue;
+ verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
+ policy) == 0);
+
if (argc == 0) {
if (first)
first = B_FALSE;
else if (!do_all)
(void) printf("\n");
- if (do_all)
+ if (do_all) {
err |= do_import(config, NULL, mntopts,
- do_force, props, do_verbatim);
- else
+ props, flags);
+ } else {
show_import(config);
+ }
} else if (searchname != NULL) {
char *name;
@@ -1829,7 +1946,7 @@ zpool_do_import(int argc, char **argv)
err = B_TRUE;
} else {
err |= do_import(found_config, argc == 1 ? NULL :
- argv[1], mntopts, do_force, props, do_verbatim);
+ argv[1], mntopts, props, flags);
}
}
@@ -1844,6 +1961,7 @@ zpool_do_import(int argc, char **argv)
error:
nvlist_free(props);
nvlist_free(pools);
+ nvlist_free(policy);
free(searchdirs);
return (err ? 1 : 0);
@@ -1871,7 +1989,7 @@ print_iostat_header(iostat_cbdata_t *cb)
{
(void) printf("%*s capacity operations bandwidth\n",
cb->cb_namewidth, "");
- (void) printf("%-*s used avail read write read write\n",
+ (void) printf("%-*s alloc free read write read write\n",
cb->cb_namewidth, "pool");
print_iostat_separator(cb);
}
@@ -1906,13 +2024,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
char *vname;
if (oldnv != NULL) {
- verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
- (uint64_t **)&oldvs, &c) == 0);
+ verify(nvlist_lookup_uint64_array(oldnv,
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
} else {
oldvs = &zerovs;
}
- verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&newvs, &c) == 0);
if (strlen(name) + depth > cb->cb_namewidth)
@@ -1962,7 +2080,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
return;
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+ uint64_t ishole = B_FALSE;
+
+ if (nvlist_lookup_uint64(newchild[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+ continue;
+
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
@@ -1983,7 +2107,8 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
(void) printf("%-*s - - - - - "
"-\n", cb->cb_namewidth, "cache");
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
@@ -2072,42 +2197,14 @@ get_namewidth(zpool_handle_t *zhp, void *data)
}
/*
- * zpool iostat [-v] [pool] ... [interval [count]]
- *
- * -v Display statistics for individual vdevs
- *
- * This command can be tricky because we want to be able to deal with pool
- * creation/destruction as well as vdev configuration changes. The bulk of this
- * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
- * on pool_list_update() to detect the addition of new pools. Configuration
- * changes are all handled within libzfs.
+ * Parse the input string and extract the 'interval' and 'count' values, if present.
*/
-int
-zpool_do_iostat(int argc, char **argv)
+static void
+get_interval_count(int *argcp, char **argv, unsigned long *iv,
+ unsigned long *cnt)
{
- int c;
- int ret;
- int npools;
unsigned long interval = 0, count = 0;
- zpool_list_t *list;
- boolean_t verbose = B_FALSE;
- iostat_cbdata_t cb;
-
- /* check options */
- while ((c = getopt(argc, argv, "v")) != -1) {
- switch (c) {
- case 'v':
- verbose = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
+ int argc = *argcp, errno;
/*
* Determine if the last argument is an integer or a pool name
@@ -2124,7 +2221,6 @@ zpool_do_iostat(int argc, char **argv)
"cannot be zero\n"));
usage(B_FALSE);
}
-
/*
* Ignore the last parameter
*/
@@ -2141,7 +2237,7 @@ zpool_do_iostat(int argc, char **argv)
/*
* If the last argument is also an integer, then we have both a count
- * and an integer.
+ * and an interval.
*/
if (argc > 0 && isdigit(argv[argc - 1][0])) {
char *end;
@@ -2166,6 +2262,66 @@ zpool_do_iostat(int argc, char **argv)
}
}
+ *iv = interval;
+ *cnt = count;
+ *argcp = argc;
+}
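A minimal standalone sketch of the convention get_interval_count() implements
(hypothetical program, not the function above; the real code additionally
rejects a zero interval via usage()): the last one or two arguments are
consumed as the interval and count when they parse as plain positive
integers, and are otherwise left in place as pool names.

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Pop one trailing positive integer off argv, if the last argument
     * parses as one; shrinks *argcp and returns 1 on success.
     */
    static int
    pop_trailing(int *argcp, char **argv, unsigned long *val)
    {
        char *end;
        unsigned long v;

        if (*argcp == 0 || !isdigit((unsigned char)argv[*argcp - 1][0]))
            return (0);
        v = strtoul(argv[*argcp - 1], &end, 10);
        if (*end != '\0' || v == 0)
            return (0);     /* not a pure positive integer */
        *val = v;
        (*argcp)--;
        return (1);
    }

    int
    main(int argc, char **argv)
    {
        unsigned long interval = 0, count = 0, tmp;

        argc--; argv++;
        /* One trailing integer is the interval; two are interval, count. */
        if (pop_trailing(&argc, argv, &interval) &&
            pop_trailing(&argc, argv, &count)) {
            tmp = interval; interval = count; count = tmp;
        }
        (void) printf("interval=%lu count=%lu, %d pool name(s) left\n",
            interval, count, argc);
        return (0);
    }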
+
+static void
+get_timestamp_arg(char c)
+{
+ if (c == 'u')
+ timestamp_fmt = UDATE;
+ else if (c == 'd')
+ timestamp_fmt = DDATE;
+ else
+ usage(B_FALSE);
+}
+
+/*
+ * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
+ *
+ * -v Display statistics for individual vdevs
+ * -T Display a timestamp in date(1) or Unix format
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes. The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
+ * on pool_list_update() to detect the addition of new pools. Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+ int c;
+ int ret;
+ int npools;
+ unsigned long interval = 0, count = 0;
+ zpool_list_t *list;
+ boolean_t verbose = B_FALSE;
+ iostat_cbdata_t cb;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "T:v")) != -1) {
+ switch (c) {
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ get_interval_count(&argc, argv, &interval, &count);
+
/*
* Construct the list of all interesting pools.
*/
@@ -2212,6 +2368,9 @@ zpool_do_iostat(int argc, char **argv)
cb.cb_namewidth = 0;
(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
+
/*
* If it's the first time, or verbose mode, print the header.
*/
@@ -2363,12 +2522,13 @@ list_callback(zpool_handle_t *zhp, void *data)
}
/*
- * zpool list [-H] [-o prop[,prop]*] [pool] ...
+ * zpool list [-H] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
*
* -H Scripted mode. Don't display headers, and separate properties
* by a single tab.
* -o List of properties to display. Defaults to
- * "name,size,used,available,capacity,health,altroot"
+ * "name,size,allocated,free,capacity,health,altroot"
+ * -T Display a timestamp in date(1) or Unix format
*
* List all pools in the system, whether or not they're healthy. Output space
* statistics for each one, as well as health status summary.
@@ -2380,11 +2540,12 @@ zpool_do_list(int argc, char **argv)
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
- "name,size,used,available,capacity,health,altroot";
+ "name,size,allocated,free,capacity,dedupratio,health,altroot";
char *props = default_props;
+ unsigned long interval = 0, count = 0;
/* check options */
- while ((c = getopt(argc, argv, ":Ho:")) != -1) {
+ while ((c = getopt(argc, argv, ":Ho:T:")) != -1) {
switch (c) {
case 'H':
cb.cb_scripted = B_TRUE;
@@ -2392,6 +2553,9 @@ zpool_do_list(int argc, char **argv)
case 'o':
props = optarg;
break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@@ -2407,21 +2571,37 @@ zpool_do_list(int argc, char **argv)
argc -= optind;
argv += optind;
+ get_interval_count(&argc, argv, &interval, &count);
+
if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
usage(B_FALSE);
cb.cb_first = B_TRUE;
- ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
- list_callback, &cb);
+ for (;;) {
- zprop_free_list(cb.cb_proplist);
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
- if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
- (void) printf(gettext("no pools available\n"));
- return (0);
+ ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
+ list_callback, &cb);
+
+ if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
+ (void) printf(gettext("no pools available\n"));
+ zprop_free_list(cb.cb_proplist);
+ return (0);
+ }
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) sleep(interval);
}
+ zprop_free_list(cb.cb_proplist);
return (ret);
}
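With -T and the interval/count loop above, the listing can repeat on a
schedule; a hypothetical invocation:

    # zpool list -T d 10 6

prints the pool listing six times at ten-second intervals, each pass
preceded by a date(1)-style timestamp.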
@@ -2436,10 +2616,10 @@ zpool_get_vdev_by_name(nvlist_t *nv, char *name)
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) {
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
- if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV)-1) == 0)
- name += sizeof(_PATH_DEV)-1;
- if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV)-1) == 0)
- path += sizeof(_PATH_DEV)-1;
+ if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+ name += sizeof(_PATH_DEV) - 1;
+ if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+ path += sizeof(_PATH_DEV) - 1;
if (strcmp(name, path) == 0)
return (nv);
return (NULL);
@@ -2628,6 +2808,146 @@ zpool_do_detach(int argc, char **argv)
}
/*
+ * zpool split [-n] [-o prop=val] ...
+ * [-o mntopt] ...
+ * [-R altroot] <pool> <newpool> [<device> ...]
+ *
+ * -n Do not split the pool, but display the resulting layout if
+ * it were to be split.
+ * -o Set property=value, or set mount options.
+ * -R Mount the split-off pool under an alternate root.
+ *
+ * Splits the named pool and gives it the new pool name. Devices to be split
+ * off may be listed, provided that no more than one device is specified
+ * per top-level vdev mirror. The newly split pool is left in an exported
+ * state unless -R is specified.
+ *
+ * Restrictions: the top level of the pool must be made up of only
+ * mirrors; all devices in the pool must be healthy; no device may be
+ * undergoing a resilvering operation.
+ */
+int
+zpool_do_split(int argc, char **argv)
+{
+ char *srcpool, *newpool, *propval;
+ char *mntopts = NULL;
+ splitflags_t flags;
+ int c, ret = 0;
+ zpool_handle_t *zhp;
+ nvlist_t *config, *props = NULL;
+
+ flags.dryrun = B_FALSE;
+ flags.import = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":R:no:")) != -1) {
+ switch (c) {
+ case 'R':
+ flags.import = B_TRUE;
+ if (add_prop_list(
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
+ &props, B_TRUE) != 0) {
+ if (props)
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ break;
+ case 'n':
+ flags.dryrun = B_TRUE;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) != NULL) {
+ *propval = '\0';
+ propval++;
+ if (add_prop_list(optarg, propval,
+ &props, B_TRUE) != 0) {
+ if (props)
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ } else {
+ mntopts = optarg;
+ }
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ break;
+ }
+ }
+
+ if (!flags.import && mntopts != NULL) {
+ (void) fprintf(stderr, gettext("setting mntopts is only "
+ "valid when importing the pool\n"));
+ usage(B_FALSE);
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("Missing pool name\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("Missing new pool name\n"));
+ usage(B_FALSE);
+ }
+
+ srcpool = argv[0];
+ newpool = argv[1];
+
+ argc -= 2;
+ argv += 2;
+
+ if ((zhp = zpool_open(g_zfs, srcpool)) == NULL)
+ return (1);
+
+ config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
+ if (config == NULL) {
+ ret = 1;
+ } else {
+ if (flags.dryrun) {
+ (void) printf(gettext("would create '%s' with the "
+ "following layout:\n\n"), newpool);
+ print_vdev_tree(NULL, newpool, config, 0, B_FALSE);
+ }
+ nvlist_free(config);
+ }
+
+ zpool_close(zhp);
+
+ if (ret != 0 || flags.dryrun || !flags.import)
+ return (ret);
+
+ /*
+ * The split was successful. Now we need to open the new
+ * pool and import it.
+ */
+ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL)
+ return (1);
+ if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+ zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+ ret = 1;
+ (void) fprintf(stderr, gettext("Split was succssful, but "
+ "the datasets could not all be mounted\n"));
+ (void) fprintf(stderr, gettext("Try doing '%s' with a "
+ "different altroot\n"), "zpool import");
+ }
+ zpool_close(zhp);
+
+ return (ret);
+}
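A hypothetical use of the new subcommand (pool and device names
illustrative): if 'tank' is a two-way mirror of da0 and da1, then

    # zpool split -R /mnt tank tankcopy da1

detaches da1, creates the new pool 'tankcopy' from it, and imports it under
the altroot /mnt. Without -R the new pool is left exported; with -n the
resulting layout is only previewed.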
+
+
+
+/*
* zpool online <pool> <device> ...
*/
int
@@ -2638,10 +2958,14 @@ zpool_do_online(int argc, char **argv)
zpool_handle_t *zhp;
int ret = 0;
vdev_state_t newstate;
+ int flags = 0;
/* check options */
- while ((c = getopt(argc, argv, "t")) != -1) {
+ while ((c = getopt(argc, argv, "et")) != -1) {
switch (c) {
+ case 'e':
+ flags |= ZFS_ONLINE_EXPAND;
+ break;
case 't':
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2669,7 +2993,7 @@ zpool_do_online(int argc, char **argv)
return (1);
for (i = 1; i < argc; i++) {
- if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) {
+ if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
if (newstate != VDEV_STATE_HEALTHY) {
(void) printf(gettext("warning: device '%s' "
"onlined, but remains in faulted state\n"),
@@ -2763,31 +3087,80 @@ zpool_do_offline(int argc, char **argv)
int
zpool_do_clear(int argc, char **argv)
{
+ int c;
int ret = 0;
+ boolean_t dryrun = B_FALSE;
+ boolean_t do_rewind = B_FALSE;
+ boolean_t xtreme_rewind = B_FALSE;
+ uint32_t rewind_policy = ZPOOL_NO_REWIND;
+ nvlist_t *policy = NULL;
zpool_handle_t *zhp;
char *pool, *device;
- if (argc < 2) {
+ /* check options */
+ while ((c = getopt(argc, argv, "FnX")) != -1) {
+ switch (c) {
+ case 'F':
+ do_rewind = B_TRUE;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'X':
+ xtreme_rewind = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name\n"));
usage(B_FALSE);
}
- if (argc > 3) {
+ if (argc > 2) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
- pool = argv[1];
- device = argc == 3 ? argv[2] : NULL;
+ if ((dryrun || xtreme_rewind) && !do_rewind) {
+ (void) fprintf(stderr,
+ gettext("-n or -X only meaningful with -F\n"));
+ usage(B_FALSE);
+ }
+ if (dryrun)
+ rewind_policy = ZPOOL_TRY_REWIND;
+ else if (do_rewind)
+ rewind_policy = ZPOOL_DO_REWIND;
+ if (xtreme_rewind)
+ rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+ /* In the future, further rewind policy choices can be passed along here */
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
+ return (1);
+
+ pool = argv[0];
+ device = argc == 2 ? argv[1] : NULL;
- if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
+ if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+ nvlist_free(policy);
return (1);
+ }
- if (zpool_clear(zhp, device) != 0)
+ if (zpool_clear(zhp, device, policy) != 0)
ret = 1;
zpool_close(zhp);
+ nvlist_free(policy);
+
return (ret);
}
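The option-to-policy mapping above composes as follows (invocations
illustrative):

    # zpool clear tank            ZPOOL_NO_REWIND
    # zpool clear -F tank         ZPOOL_DO_REWIND
    # zpool clear -F -n tank      ZPOOL_TRY_REWIND (dry run)
    # zpool clear -F -X tank      ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND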
@@ -2812,7 +3185,7 @@ scrub_callback(zpool_handle_t *zhp, void *data)
return (1);
}
- err = zpool_scrub(zhp, cb->cb_type);
+ err = zpool_scan(zhp, cb->cb_type);
return (err != 0);
}
@@ -2828,13 +3201,13 @@ zpool_do_scrub(int argc, char **argv)
int c;
scrub_cbdata_t cb;
- cb.cb_type = POOL_SCRUB_EVERYTHING;
+ cb.cb_type = POOL_SCAN_SCRUB;
/* check options */
while ((c = getopt(argc, argv, "s")) != -1) {
switch (c) {
case 's':
- cb.cb_type = POOL_SCRUB_NONE;
+ cb.cb_type = POOL_SCAN_NONE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2862,68 +3235,119 @@ typedef struct status_cbdata {
boolean_t cb_verbose;
boolean_t cb_explain;
boolean_t cb_first;
+ boolean_t cb_dedup_stats;
} status_cbdata_t;
/*
* Print out detailed scrub status.
*/
void
-print_scrub_status(nvlist_t *nvroot)
+print_scan_status(pool_scan_stat_t *ps)
{
- vdev_stat_t *vs;
- uint_t vsc;
- time_t start, end, now;
+ time_t start, end;
+ uint64_t elapsed, mins_left, hours_left;
+ uint64_t pass_exam, examined, total;
+ uint_t rate;
double fraction_done;
- uint64_t examined, total, minutes_left, minutes_taken;
- char *scrub_type;
+ char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &vsc) == 0);
+ (void) printf(gettext(" scan: "));
- /*
- * If there's never been a scrub, there's not much to say.
- */
- if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) {
+ /* If there's never been a scan, there's not much to say. */
+ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+ ps->pss_func >= POOL_SCAN_FUNCS) {
(void) printf(gettext("none requested\n"));
return;
}
- scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilver" : "scrub";
+ start = ps->pss_start_time;
+ end = ps->pss_end_time;
+ zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
- start = vs->vs_scrub_start;
- end = vs->vs_scrub_end;
- now = time(NULL);
- examined = vs->vs_scrub_examined;
- total = vs->vs_alloc;
-
- if (end != 0) {
- minutes_taken = (uint64_t)((end - start) / 60);
-
- (void) printf(gettext("%s %s after %lluh%um with %llu errors "
- "on %s"),
- scrub_type, vs->vs_scrub_complete ? "completed" : "stopped",
+ assert(ps->pss_func == POOL_SCAN_SCRUB ||
+ ps->pss_func == POOL_SCAN_RESILVER);
+ /*
+ * Scan is finished or canceled.
+ */
+ if (ps->pss_state == DSS_FINISHED) {
+ uint64_t minutes_taken = (end - start) / 60;
+ char *fmt;
+
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ fmt = gettext("scrub repaired %s in %lluh%um with "
+ "%llu errors on %s");
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ fmt = gettext("resilvered %s in %lluh%um with "
+ "%llu errors on %s");
+ }
+ /* LINTED */
+ (void) printf(fmt, processed_buf,
(u_longlong_t)(minutes_taken / 60),
(uint_t)(minutes_taken % 60),
- (u_longlong_t)vs->vs_scrub_errors, ctime(&end));
+ (u_longlong_t)ps->pss_errors,
+ ctime((time_t *)&end));
+ return;
+ } else if (ps->pss_state == DSS_CANCELED) {
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub canceled on %s"),
+ ctime(&end));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver canceled on %s"),
+ ctime(&end));
+ }
return;
}
- if (examined == 0)
- examined = 1;
- if (examined > total)
- total = examined;
+ assert(ps->pss_state == DSS_SCANNING);
+
+ /*
+ * Scan is in progress.
+ */
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub in progress since %s"),
+ ctime(&start));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver in progress since %s"),
+ ctime(&start));
+ }
+ examined = ps->pss_examined ? ps->pss_examined : 1;
+ total = ps->pss_to_examine;
fraction_done = (double)examined / total;
- minutes_left = (uint64_t)((now - start) *
- (1 - fraction_done) / fraction_done / 60);
- minutes_taken = (uint64_t)((now - start) / 60);
- (void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, "
- "%lluh%um to go\n"),
- scrub_type, (u_longlong_t)(minutes_taken / 60),
- (uint_t)(minutes_taken % 60), 100 * fraction_done,
- (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
+ /* elapsed time for this pass */
+ elapsed = time(NULL) - ps->pss_pass_start;
+ elapsed = elapsed ? elapsed : 1;
+ pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+ rate = pass_exam / elapsed;
+ rate = rate ? rate : 1;
+ mins_left = ((total - examined) / rate) / 60;
+ hours_left = mins_left / 60;
+
+ zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+ zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+ /*
+ * Do not print the estimated time to completion if it exceeds 30 days.
+ */
+ (void) printf(gettext(" %s scanned out of %s at %s/s"),
+ examined_buf, total_buf, rate_buf);
+ if (hours_left < (30 * 24)) {
+ (void) printf(gettext(", %lluh%um to go\n"),
+ (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+ } else {
+ (void) printf(gettext(
+ ", (scan is slow, no estimated time)\n"));
+ }
+
+ if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext(" %s resilvered, %.2f%% done\n"),
+ processed_buf, 100 * fraction_done);
+ } else if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext(" %s repaired, %.2f%% done\n"),
+ processed_buf, 100 * fraction_done);
+ }
}
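A worked example of the estimate above, with illustrative numbers: if this
pass has examined 36 GB (pss_pass_exam) in 600 seconds of elapsed time, the
rate is 60 MB/s; with total = 1024 GB and examined = 36 GB, mins_left =
((1024 - 36) GB / 60 MB/s) / 60 ≈ 281, so the line reports about 4h41m to
go. Estimates beyond 720 hours (30 days) are suppressed in favor of "(scan
is slow, no estimated time)".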
static void
@@ -2974,7 +3398,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
(void) printf(gettext("\tspares\n"));
for (i = 0; i < nspares; i++) {
- name = zpool_vdev_name(g_zfs, zhp, spares[i]);
+ name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE);
print_status_config(zhp, name, spares[i],
namewidth, 2, B_TRUE);
free(name);
@@ -2994,13 +3418,43 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
(void) printf(gettext("\tcache\n"));
for (i = 0; i < nl2cache; i++) {
- name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
+ name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE);
print_status_config(zhp, name, l2cache[i],
namewidth, 2, B_FALSE);
free(name);
}
}
+static void
+print_dedup_stats(nvlist_t *config)
+{
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+ uint_t c;
+
+ /*
+ * If the pool was faulted then we may not have been able to
+ * obtain the config. Otherwise, if we have anything in the
+ * dedup table, continue processing the stats.
+ */
+ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0)
+ return;
+
+ (void) printf("\n");
+ (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
+ (u_longlong_t)ddo->ddo_count,
+ (u_longlong_t)ddo->ddo_dspace,
+ (u_longlong_t)ddo->ddo_mspace);
+
+ verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t **)&dds, &c) == 0);
+ verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t **)&ddh, &c) == 0);
+ zpool_dump_ddt(dds, ddh);
+}
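Given the printf() above, the summary takes this shape (numbers
illustrative), followed by the histogram emitted by zpool_dump_ddt():

    DDT entries 115608, size 23741 on disk, 37482 in core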
+
/*
* Display a summary of pool status. Displays a summary such as:
*
@@ -3053,7 +3507,7 @@ status_callback(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
@@ -3091,8 +3545,8 @@ status_callback(zpool_handle_t *zhp, void *data)
"be used because the label is missing \n\tor invalid. "
"There are insufficient replicas for the pool to "
"continue\n\tfunctioning.\n"));
- (void) printf(gettext("action: Destroy and re-create the pool "
- "from a backup source.\n"));
+ zpool_explain_recover(zpool_get_handle(zhp),
+ zpool_get_name(zhp), reason, config);
break;
case ZPOOL_STATUS_FAILING_DEV:
@@ -3116,6 +3570,16 @@ status_callback(zpool_handle_t *zhp, void *data)
"replace'.\n"));
break;
+ case ZPOOL_STATUS_REMOVED_DEV:
+ (void) printf(gettext("status: One or more devices has "
+ "been removed by the administrator.\n\tSufficient "
+ "replicas exist for the pool to continue functioning in "
+ "a\n\tdegraded state.\n"));
+ (void) printf(gettext("action: Online the device using "
+ "'zpool online' or replace the device with\n\t'zpool "
+ "replace'.\n"));
+ break;
+
case ZPOOL_STATUS_RESILVERING:
(void) printf(gettext("status: One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
@@ -3136,8 +3600,8 @@ status_callback(zpool_handle_t *zhp, void *data)
case ZPOOL_STATUS_CORRUPT_POOL:
(void) printf(gettext("status: The pool metadata is corrupted "
"and the pool cannot be opened.\n"));
- (void) printf(gettext("action: Destroy and re-create the pool "
- "from a backup source.\n"));
+ zpool_explain_recover(zpool_get_handle(zhp),
+ zpool_get_name(zhp), reason, config);
break;
case ZPOOL_STATUS_VERSION_OLDER:
@@ -3213,10 +3677,11 @@ status_callback(zpool_handle_t *zhp, void *data)
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ pool_scan_stat_t *ps = NULL;
-
- (void) printf(gettext(" scrub: "));
- print_scrub_status(nvroot);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+ print_scan_status(ps);
namewidth = max_width(zhp, nvroot, 0, 0);
if (namewidth < 10)
@@ -3272,6 +3737,9 @@ status_callback(zpool_handle_t *zhp, void *data)
else
print_error_log(zhp);
}
+
+ if (cbp->cb_dedup_stats)
+ print_dedup_stats(config);
} else {
(void) printf(gettext("config: The configuration cannot be "
"determined.\n"));
@@ -3281,10 +3749,12 @@ status_callback(zpool_handle_t *zhp, void *data)
}
/*
- * zpool status [-vx] [pool] ...
+ * zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
*
* -v Display complete error logs
* -x Display only pools with potential problems
+ * -D Display dedup status (undocumented)
+ * -T Display a timestamp in date(1) or Unix format
*
* Describes the health status of all pools or some subset.
*/
@@ -3293,10 +3763,11 @@ zpool_do_status(int argc, char **argv)
{
int c;
int ret;
+ unsigned long interval = 0, count = 0;
status_cbdata_t cb = { 0 };
/* check options */
- while ((c = getopt(argc, argv, "vx")) != -1) {
+ while ((c = getopt(argc, argv, "vxDT:")) != -1) {
switch (c) {
case 'v':
cb.cb_verbose = B_TRUE;
@@ -3304,6 +3775,12 @@ zpool_do_status(int argc, char **argv)
case 'x':
cb.cb_explain = B_TRUE;
break;
+ case 'D':
+ cb.cb_dedup_stats = B_TRUE;
+ break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@@ -3314,19 +3791,38 @@ zpool_do_status(int argc, char **argv)
argc -= optind;
argv += optind;
- cb.cb_first = B_TRUE;
+ get_interval_count(&argc, argv, &interval, &count);
if (argc == 0)
cb.cb_allpools = B_TRUE;
- ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb);
+ cb.cb_first = B_TRUE;
- if (argc == 0 && cb.cb_count == 0)
- (void) printf(gettext("no pools available\n"));
- else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
- (void) printf(gettext("all pools are healthy\n"));
+ for (;;) {
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
- return (ret);
+ ret = for_each_pool(argc, argv, B_TRUE, NULL,
+ status_callback, &cb);
+
+ if (argc == 0 && cb.cb_count == 0)
+ (void) printf(gettext("no pools available\n"));
+ else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+ (void) printf(gettext("all pools are healthy\n"));
+
+ if (ret != 0)
+ return (ret);
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) sleep(interval);
+ }
+
+ return (0);
}
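As with iostat and list, status now accepts a trailing interval and count; a
hypothetical invocation:

    # zpool status -x -T u 30

re-evaluates the problem-pool summary every 30 seconds, prefixing each pass
with a Unix timestamp, until interrupted.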
typedef struct upgrade_cbdata {
@@ -3489,7 +3985,7 @@ zpool_do_upgrade(int argc, char **argv)
/* check options */
- while ((c = getopt(argc, argv, "avV:")) != -1) {
+ while ((c = getopt(argc, argv, ":avV:")) != -1) {
switch (c) {
case 'a':
cb.cb_all = B_TRUE;
@@ -3506,6 +4002,11 @@ zpool_do_upgrade(int argc, char **argv)
usage(B_FALSE);
}
break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@@ -3568,11 +4069,25 @@ zpool_do_upgrade(int argc, char **argv)
(void) printf(gettext(" 13 snapused property\n"));
(void) printf(gettext(" 14 passthrough-x aclinherit\n"));
(void) printf(gettext(" 15 user/group space accounting\n"));
- (void) printf(gettext("For more information on a particular "
- "version, including supported releases, see:\n\n"));
- (void) printf("http://www.opensolaris.org/os/community/zfs/"
- "version/N\n\n");
- (void) printf(gettext("Where 'N' is the version number.\n"));
+ (void) printf(gettext(" 16 stmf property support\n"));
+ (void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
+ (void) printf(gettext(" 18 Snapshot user holds\n"));
+ (void) printf(gettext(" 19 Log device removal\n"));
+ (void) printf(gettext(" 20 Compression using zle "
+ "(zero-length encoding)\n"));
+ (void) printf(gettext(" 21 Deduplication\n"));
+ (void) printf(gettext(" 22 Received properties\n"));
+ (void) printf(gettext(" 23 Slim ZIL\n"));
+ (void) printf(gettext(" 24 System attributes\n"));
+ (void) printf(gettext(" 25 Improved scrub stats\n"));
+ (void) printf(gettext(" 26 Improved snapshot deletion "
+ "performance\n"));
+ (void) printf(gettext(" 27 Improved snapshot creation "
+ "performance\n"));
+ (void) printf(gettext(" 28 Multiple vdev replacements\n"));
+ (void) printf(gettext("\nFor more information on a particular "
+ "version, including supported releases,\n"));
+ (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
} else if (argc == 0) {
int notfound;
@@ -3624,47 +4139,6 @@ typedef struct hist_cbdata {
int internal;
} hist_cbdata_t;
-char *hist_event_table[LOG_END] = {
- "invalid event",
- "pool create",
- "vdev add",
- "pool remove",
- "pool destroy",
- "pool export",
- "pool import",
- "vdev attach",
- "vdev replace",
- "vdev detach",
- "vdev online",
- "vdev offline",
- "vdev upgrade",
- "pool clear",
- "pool scrub",
- "pool property set",
- "create",
- "clone",
- "destroy",
- "destroy_begin_sync",
- "inherit",
- "property set",
- "quota set",
- "permission update",
- "permission remove",
- "permission who remove",
- "promote",
- "receive",
- "rename",
- "reservation set",
- "replay_inc_sync",
- "replay_full_sync",
- "rollback",
- "snapshot",
- "filesystem version upgrade",
- "refquota set",
- "refreservation set",
- "pool scrub done",
-};
-
/*
* Print out the command history for a specific pool.
*/
@@ -3722,7 +4196,7 @@ get_history_one(zpool_handle_t *zhp, void *data)
(void) snprintf(internalstr,
sizeof (internalstr),
"[internal %s txg:%lld] %s",
- hist_event_table[ievent], txg,
+ zfs_history_event_names[ievent], txg,
pathstr);
cmdstr = internalstr;
}
@@ -3834,7 +4308,8 @@ get_callback(zpool_handle_t *zhp, void *data)
continue;
zprop_print_one_property(zpool_get_name(zhp), cbp,
- zpool_prop_to_name(pl->pl_prop), value, srctype, NULL);
+ zpool_prop_to_name(pl->pl_prop), value, srctype, NULL,
+ NULL);
}
return (0);
}
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
index f44da4ff60f5..c7a002efb17c 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <errno.h>
#include <libgen.h>
#include <libintl.h>
@@ -51,22 +49,6 @@ safe_malloc(size_t size)
}
/*
- * Same as above, but for strdup()
- */
-char *
-safe_strdup(const char *str)
-{
- char *ret;
-
- if ((ret = strdup(str)) == NULL) {
- (void) fprintf(stderr, "internal error: out of memory\n");
- exit(1);
- }
-
- return (ret);
-}
-
-/*
* Display an out of memory error message and abort the current program.
*/
void
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
index e82f3202af2a..134c730fcf8e 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef ZPOOL_UTIL_H
@@ -37,7 +36,6 @@ extern "C" {
* Basic utility functions
*/
void *safe_malloc(size_t);
-char *safe_strdup(const char *);
void zpool_no_memory(void);
uint_t num_logs(nvlist_t *nv);
@@ -46,7 +44,9 @@ uint_t num_logs(nvlist_t *nv);
*/
nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t isreplace, boolean_t dryrun, int argc, char **argv);
+ boolean_t replacing, boolean_t dryrun, int argc, char **argv);
+nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
+ nvlist_t *props, splitflags_t flags, int argc, char **argv);
/*
* Pool list functions
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
index 35a636c91128..5ffd39ac8fe6 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -66,6 +65,7 @@
#include <fcntl.h>
#include <libintl.h>
#include <libnvpair.h>
+#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
@@ -77,6 +77,10 @@
#include "zpool_util.h"
+#define DISK_ROOT "/dev/dsk"
+#define RDISK_ROOT "/dev/rdsk"
+#define BACKUP_SLICE "s2"
+
/*
* For any given vdev specification, we can have multiple errors. The
* vdev_error() function keeps track of whether we have seen an error yet, and
@@ -107,6 +111,170 @@ vdev_error(const char *fmt, ...)
va_end(ap);
}
+#ifdef sun
+static void
+libdiskmgt_error(int error)
+{
+ /*
+ * ENXIO/ENODEV is a valid error message if the device doesn't live in
+ * /dev/dsk. Don't bother printing an error message in this case.
+ */
+ if (error == ENXIO || error == ENODEV)
+ return;
+
+ (void) fprintf(stderr, gettext("warning: device in use checking "
+ "failed: %s\n"), strerror(error));
+}
+
+/*
+ * Validate a device, passing the bulk of the work off to libdiskmgt.
+ */
+static int
+check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
+{
+ char *msg;
+ int error = 0;
+ dm_who_type_t who;
+
+ if (force)
+ who = DM_WHO_ZPOOL_FORCE;
+ else if (isspare)
+ who = DM_WHO_ZPOOL_SPARE;
+ else
+ who = DM_WHO_ZPOOL;
+
+ if (dm_inuse((char *)path, &msg, who, &error) || error) {
+ if (error != 0) {
+ libdiskmgt_error(error);
+ return (0);
+ } else {
+ vdev_error("%s", msg);
+ free(msg);
+ return (-1);
+ }
+ }
+
+ /*
+ * If we're given a whole disk, ignore overlapping slices since we're
+ * about to label it anyway.
+ */
+ error = 0;
+ if (!wholedisk && !force &&
+ (dm_isoverlapping((char *)path, &msg, &error) || error)) {
+ if (error == 0) {
+ /* dm_isoverlapping returned -1 */
+ vdev_error(gettext("%s overlaps with %s\n"), path, msg);
+ free(msg);
+ return (-1);
+ } else if (error != ENODEV) {
+ /* libdiskmgt's devcache only handles physical drives */
+ libdiskmgt_error(error);
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+
+/*
+ * Validate a whole disk. Iterate over all slices on the disk and make sure
+ * that none is in use by calling check_slice().
+ */
+static int
+check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
+{
+ dm_descriptor_t *drive, *media, *slice;
+ int err = 0;
+ int i;
+ int ret;
+
+ /*
+ * Get the drive associated with this disk. This should never fail,
+ * because we already have an alias handle open for the device.
+ */
+ if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
+ &err)) == NULL || *drive == NULL) {
+ if (err)
+ libdiskmgt_error(err);
+ return (0);
+ }
+
+ if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
+ &err)) == NULL) {
+ dm_free_descriptors(drive);
+ if (err)
+ libdiskmgt_error(err);
+ return (0);
+ }
+
+ dm_free_descriptors(drive);
+
+ /*
+ * It is possible that the user has specified a removable media drive,
+ * and the media is not present.
+ */
+ if (*media == NULL) {
+ dm_free_descriptors(media);
+ vdev_error(gettext("'%s' has no media in drive\n"), name);
+ return (-1);
+ }
+
+ if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
+ &err)) == NULL) {
+ dm_free_descriptors(media);
+ if (err)
+ libdiskmgt_error(err);
+ return (0);
+ }
+
+ dm_free_descriptors(media);
+
+ ret = 0;
+
+ /*
+ * Iterate over all slices and report any errors. We don't care about
+ * overlapping slices because we are using the whole disk.
+ */
+ for (i = 0; slice[i] != NULL; i++) {
+ char *name = dm_get_name(slice[i], &err);
+
+ if (check_slice(name, force, B_TRUE, isspare) != 0)
+ ret = -1;
+
+ dm_free_name(name);
+ }
+
+ dm_free_descriptors(slice);
+ return (ret);
+}
+
+/*
+ * Validate a device.
+ */
+static int
+check_device(const char *path, boolean_t force, boolean_t isspare)
+{
+ dm_descriptor_t desc;
+ int err;
+ char *dev;
+
+ /*
+ * For whole disks, libdiskmgt does not include the leading dev path.
+ */
+ dev = strrchr(path, '/');
+ assert(dev != NULL);
+ dev++;
+ if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
+ err = check_disk(path, desc, force, isspare);
+ dm_free_descriptor(desc);
+ return (err);
+ }
+
+ return (check_slice(path, force, B_FALSE, isspare));
+}
+#endif /* sun */
+
/*
* Check that a file is valid. All we can do in this case is check that it's
* not in use by another pool, and not in use by swap.
@@ -121,7 +289,7 @@ check_file(const char *file, boolean_t force, boolean_t isspare)
pool_state_t state;
boolean_t inuse;
-#if 0
+#ifdef sun
if (dm_inuse_swap(file, &err)) {
if (err)
libdiskmgt_error(err);
@@ -185,7 +353,7 @@ check_file(const char *file, boolean_t force, boolean_t isspare)
}
static int
-check_provider(const char *name, boolean_t force, boolean_t isspare)
+check_device(const char *name, boolean_t force, boolean_t isspare)
{
char path[MAXPATHLEN];
@@ -206,24 +374,44 @@ check_provider(const char *name, boolean_t force, boolean_t isspare)
* it isn't.
*/
static boolean_t
-is_whole_disk(const char *name)
+is_whole_disk(const char *arg)
{
+#ifdef sun
+ struct dk_gpt *label;
+ int fd;
+ char path[MAXPATHLEN];
+
+ (void) snprintf(path, sizeof (path), "%s%s%s",
+ RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
+ if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
+ return (B_FALSE);
+ if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
+ (void) close(fd);
+ return (B_FALSE);
+ }
+ efi_free(label);
+ (void) close(fd);
+ return (B_TRUE);
+#else
int fd;
- fd = g_open(name, 0);
+ fd = g_open(arg, 0);
if (fd >= 0) {
g_close(fd);
return (B_TRUE);
}
return (B_FALSE);
+#endif
}
/*
- * Create a leaf vdev. Determine if this is a GEOM provider.
- * Valid forms for a leaf vdev are:
+ * Create a leaf vdev. Determine if this is a file or a device. If it's a
+ * device, fill in the device id to make a complete nvlist. Valid forms for a
+ * leaf vdev are:
*
- * /dev/xxx Complete path to a GEOM provider
- * xxx Shorthand for /dev/xxx
+ * /dev/dsk/xxx Complete disk path
+ * /xxx Full path to file
+ * xxx Shorthand for /dev/dsk/xxx
*/
static nvlist_t *
make_leaf_vdev(const char *arg, uint64_t is_log)
@@ -290,10 +478,18 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
}
}
+#ifdef __FreeBSD__
+ if (S_ISCHR(statbuf.st_mode)) {
+ statbuf.st_mode &= ~S_IFCHR;
+ statbuf.st_mode |= S_IFBLK;
+ wholedisk = B_FALSE;
+ }
+#endif
+
/*
* Determine whether this is a device or a file.
*/
- if (wholedisk) {
+ if (wholedisk || S_ISBLK(statbuf.st_mode)) {
type = VDEV_TYPE_DISK;
} else if (S_ISREG(statbuf.st_mode)) {
type = VDEV_TYPE_FILE;
@@ -314,12 +510,12 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
if (strcmp(type, VDEV_TYPE_DISK) == 0)
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
- (uint64_t)B_FALSE) == 0);
+ (uint64_t)wholedisk) == 0);
/*
* For a whole disk, defer getting its devid until after labeling it.
*/
- if (1 || (S_ISBLK(statbuf.st_mode) && !wholedisk)) {
+ if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
/*
* Get the devid for the device.
*/
@@ -527,16 +723,14 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
*/
if ((fd = open(path, O_RDONLY)) >= 0) {
err = fstat64(fd, &statbuf);
- if (err == 0 &&
- S_ISCHR(statbuf.st_mode)) {
- err = ioctl(fd, DIOCGMEDIASIZE,
- &statbuf.st_size);
- }
(void) close(fd);
} else {
err = stat64(path, &statbuf);
}
- if (err != 0 || statbuf.st_size == 0)
+
+ if (err != 0 ||
+ statbuf.st_size == 0 ||
+ statbuf.st_size == MAXOFFSET_T)
continue;
size = statbuf.st_size;
@@ -714,6 +908,112 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
return (ret);
}
+#ifdef sun
+/*
+ * Go through and find any whole disks in the vdev specification, labelling them
+ * as appropriate. When constructing the vdev spec, we were unable to open this
+ * device in order to provide a devid. Now that we have labelled the disk and
+ * know that slice 0 is valid, we can construct the devid now.
+ *
+ * If the disk was already labeled with an EFI label, we will have gotten the
+ * devid already (because we were able to open the whole disk). Otherwise, we
+ * need to get the devid after we label the disk.
+ */
+static int
+make_disks(zpool_handle_t *zhp, nvlist_t *nv)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *type, *path, *diskname;
+ char buf[MAXPATHLEN];
+ uint64_t wholedisk;
+ int fd;
+ int ret;
+ ddi_devid_t devid;
+ char *minor = NULL, *devid_str = NULL;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+
+ if (strcmp(type, VDEV_TYPE_DISK) != 0)
+ return (0);
+
+ /*
+ * We have a disk device. Get the path to the device
+ * and see if it's a whole disk by appending the backup
+ * slice and stat()ing the device.
+ */
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk) != 0 || !wholedisk)
+ return (0);
+
+ diskname = strrchr(path, '/');
+ assert(diskname != NULL);
+ diskname++;
+ if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
+ return (-1);
+
+ /*
+ * Fill in the devid, now that we've labeled the disk.
+ */
+ (void) snprintf(buf, sizeof (buf), "%ss0", path);
+ if ((fd = open(buf, O_RDONLY)) < 0) {
+ (void) fprintf(stderr,
+ gettext("cannot open '%s': %s\n"),
+ buf, strerror(errno));
+ return (-1);
+ }
+
+ if (devid_get(fd, &devid) == 0) {
+ if (devid_get_minor_name(fd, &minor) == 0 &&
+ (devid_str = devid_str_encode(devid, minor)) !=
+ NULL) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_DEVID, devid_str) == 0);
+ }
+ if (devid_str != NULL)
+ devid_str_free(devid_str);
+ if (minor != NULL)
+ devid_str_free(minor);
+ devid_free(devid);
+ }
+
+ /*
+ * Update the path to refer to the 's0' slice. The presence of
+ * the 'whole_disk' field indicates to the CLI that we should
+ * chop off the slice number when displaying the device in
+ * future output.
+ */
+ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
+
+ (void) close(fd);
+
+ return (0);
+ }
+
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ return (0);
+}
+#endif /* sun */
+
/*
* Determine if the given path is a hot spare within the given configuration.
*/
@@ -742,8 +1042,8 @@ is_spare(nvlist_t *config, const char *path)
return (B_FALSE);
}
free(name);
-
(void) close(fd);
+
verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
nvlist_free(label);
@@ -767,8 +1067,8 @@ is_spare(nvlist_t *config, const char *path)
* the majority of this task.
*/
static int
-check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
- int isspare)
+check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+ boolean_t replacing, boolean_t isspare)
{
nvlist_t **child;
uint_t c, children;
@@ -789,14 +1089,22 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
* hot spare within the same pool. If so, we allow it
* regardless of what libdiskmgt or zpool_in_use() says.
*/
- if (isreplacing) {
- (void) strlcpy(buf, path, sizeof (buf));
+ if (replacing) {
+#ifdef sun
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk) == 0 && wholedisk)
+ (void) snprintf(buf, sizeof (buf), "%ss0",
+ path);
+ else
+#endif
+ (void) strlcpy(buf, path, sizeof (buf));
+
if (is_spare(config, buf))
return (0);
}
if (strcmp(type, VDEV_TYPE_DISK) == 0)
- ret = check_provider(path, force, isspare);
+ ret = check_device(path, force, isspare);
if (strcmp(type, VDEV_TYPE_FILE) == 0)
ret = check_file(path, force, isspare);
@@ -806,41 +1114,56 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_FALSE)) != 0)
+ replacing, B_FALSE)) != 0)
return (ret);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0)
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_TRUE)) != 0)
+ replacing, B_TRUE)) != 0)
return (ret);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0)
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_FALSE)) != 0)
+ replacing, B_FALSE)) != 0)
return (ret);
return (0);
}
static const char *
-is_grouping(const char *type, int *mindev)
+is_grouping(const char *type, int *mindev, int *maxdev)
{
- if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) {
- if (mindev != NULL)
- *mindev = 2;
- return (VDEV_TYPE_RAIDZ);
- }
+ if (strncmp(type, "raidz", 5) == 0) {
+ const char *p = type + 5;
+ char *end;
+ long nparity;
+
+ if (*p == '\0') {
+ nparity = 1;
+ } else if (*p == '0') {
+ return (NULL); /* no zero prefixes allowed */
+ } else {
+ errno = 0;
+ nparity = strtol(p, &end, 10);
+ if (errno != 0 || nparity < 1 || nparity >= 255 ||
+ *end != '\0')
+ return (NULL);
+ }
- if (strcmp(type, "raidz2") == 0) {
if (mindev != NULL)
- *mindev = 3;
+ *mindev = nparity + 1;
+ if (maxdev != NULL)
+ *maxdev = 255;
return (VDEV_TYPE_RAIDZ);
}
+ if (maxdev != NULL)
+ *maxdev = INT_MAX;
+
if (strcmp(type, "mirror") == 0) {
if (mindev != NULL)
*mindev = 2;
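A few illustrative cases for the generalized parity parser above:

    "raidz"               nparity 1, *mindev = 2
    "raidz2"              nparity 2, *mindev = 3
    "raidz3"              nparity 3, *mindev = 4 (triple parity, pool version 17)
    "raidz0", "raidz03"   rejected (zero or zero-prefixed parity)
    "raidz255"            rejected (nparity must be less than 255)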
@@ -878,7 +1201,7 @@ nvlist_t *
construct_spec(int argc, char **argv)
{
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
- int t, toplevels, mindev, nspares, nlogs, nl2cache;
+ int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
const char *type;
uint64_t is_log;
boolean_t seen_logs;
@@ -900,7 +1223,7 @@ construct_spec(int argc, char **argv)
* If it's a mirror or raidz, the subsequent arguments are
* its leaves -- until we encounter the next mirror or raidz.
*/
- if ((type = is_grouping(argv[0], &mindev)) != NULL) {
+ if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
nvlist_t **child = NULL;
int c, children = 0;
@@ -957,7 +1280,7 @@ construct_spec(int argc, char **argv)
}
for (c = 1; c < argc; c++) {
- if (is_grouping(argv[c], NULL) != NULL)
+ if (is_grouping(argv[c], NULL, NULL) != NULL)
break;
children++;
child = realloc(child,
@@ -977,6 +1300,13 @@ construct_spec(int argc, char **argv)
return (NULL);
}
+ if (children > maxdev) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: %s supports no more than "
+ "%d devices\n"), argv[0], maxdev);
+ return (NULL);
+ }
+
argc -= c;
argv += c;
@@ -1071,6 +1401,54 @@ construct_spec(int argc, char **argv)
return (nvroot);
}
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+ splitflags_t flags, int argc, char **argv)
+{
+ nvlist_t *newroot = NULL, **child;
+ uint_t c, children;
+
+ if (argc > 0) {
+ if ((newroot = construct_spec(argc, argv)) == NULL) {
+ (void) fprintf(stderr, gettext("Unable to build a "
+ "pool from the specified devices\n"));
+ return (NULL);
+ }
+
+#ifdef sun
+ if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+#endif
+
+ /* avoid any tricks in the spec */
+ verify(nvlist_lookup_nvlist_array(newroot,
+ ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+ for (c = 0; c < children; c++) {
+ char *path;
+ const char *type;
+ int min, max;
+
+ verify(nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_PATH, &path) == 0);
+ if ((type = is_grouping(path, &min, &max)) != NULL) {
+ (void) fprintf(stderr, gettext("Cannot use "
+ "'%s' as a device for splitting\n"), type);
+ nvlist_free(newroot);
+ return (NULL);
+ }
+ }
+ }
+
+ if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
+ if (newroot != NULL)
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ return (newroot);
+}
/*
* Get and validate the contents of the given vdev specification. This ensures
@@ -1084,7 +1462,7 @@ construct_spec(int argc, char **argv)
*/
nvlist_t *
make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
+ boolean_t replacing, boolean_t dryrun, int argc, char **argv)
{
nvlist_t *newroot;
nvlist_t *poolconfig = NULL;
@@ -1107,8 +1485,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
* uses (such as a dedicated dump device) that even '-f' cannot
* override.
*/
- if (check_in_use(poolconfig, newroot, force, isreplacing,
- B_FALSE) != 0) {
+ if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
nvlist_free(newroot);
return (NULL);
}
@@ -1123,5 +1500,15 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
return (NULL);
}
+#ifdef sun
+ /*
+ * Run through the vdev specification and label any whole disks found.
+ */
+ if (!dryrun && make_disks(zhp, newroot) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+#endif
+
return (newroot);
}
diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1 b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1
new file mode 100644
index 000000000000..9e11948becf4
--- /dev/null
+++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1
@@ -0,0 +1,67 @@
+'\" te
+.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved
+.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with
+.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
+.TH zstreamdump 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands"
+.SH NAME
+zstreamdump \- filter data in zfs send stream
+.SH SYNOPSIS
+.LP
+.nf
+\fBzstreamdump\fR [\fB-C\fR] [\fB-v\fR]
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The \fBzstreamdump\fR utility reads from the output of the \fBzfs send\fR command, then displays headers and some statistics from that output. See \fBzfs\fR(1M).
+.SH OPTIONS
+.sp
+.LP
+The following options are supported:
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-C\fR\fR
+.ad
+.sp .6
+.RS 4n
+Suppress the validation of checksums.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Verbose. Dump all headers, not only begin and end headers.
+.RE
+
+.SH ATTRIBUTES
+.sp
+.LP
+See \fBattributes\fR(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+tab() box;
+cw(2.75i) |cw(2.75i)
+lw(2.75i) |lw(2.75i)
+.
+ATTRIBUTE TYPE	ATTRIBUTE VALUE
+_
+Availability	SUNWzfsu
+_
+Interface Stability	Uncommitted
+.TE
+
+.SH SEE ALSO
+.sp
+.LP
+\fBzfs\fR(1M), \fBattributes\fR(5)
diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
new file mode 100644
index 000000000000..df23cc1e5a38
--- /dev/null
+++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
@@ -0,0 +1,429 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include <zfs_fletcher.h>
+
+uint64_t drr_record_count[DRR_NUMTYPES];
+uint64_t total_write_size = 0;
+uint64_t total_stream_len = 0;
+FILE *send_stream = 0;
+boolean_t do_byteswap = B_FALSE;
+boolean_t do_cksum = B_TRUE;
+#define INITIAL_BUFLEN (1<<20)
+
+static void
+usage(void)
+{
+ (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] < file\n");
+ (void) fprintf(stderr, "\t -v -- verbose\n");
+ (void) fprintf(stderr, "\t -C -- suppress checksum verification\n");
+ exit(1);
+}
+
+/*
+ * ssread - send stream read.
+ *
+ * Read while computing incremental checksum
+ */
+
+static size_t
+ssread(void *buf, size_t len, zio_cksum_t *cksum)
+{
+ size_t outlen;
+
+ if ((outlen = fread(buf, len, 1, send_stream)) == 0)
+ return (0);
+
+ if (do_cksum && cksum) {
+ if (do_byteswap)
+ fletcher_4_incremental_byteswap(buf, len, cksum);
+ else
+ fletcher_4_incremental_native(buf, len, cksum);
+ }
+ total_stream_len += len;
+ return (outlen);
+}
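The running checksum only works because the caller snapshots it between
records: the checksum stored in a DRR_END record covers everything before
that record, so main() compares it against pcksum, the value saved before
the END record itself was folded in. A sketch of the pattern, assuming (as
the DRR_END comparison below implies; the tail of the read loop is not shown
in this excerpt) that pcksum is refreshed after each record:

    zio_cksum_t zc = { 0 }, pcksum = { 0 };

    while (ssread(drr, sizeof (dmu_replay_record_t), &zc)) {
        /* ... dispatch on drr_type, reading payloads via ssread(..., &zc) ... */
        pcksum = zc;    /* checksum of the stream up to this record */
    }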
+
+int
+main(int argc, char *argv[])
+{
+ char *buf = malloc(INITIAL_BUFLEN);
+ dmu_replay_record_t thedrr;
+ dmu_replay_record_t *drr = &thedrr;
+ struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+ struct drr_end *drre = &thedrr.drr_u.drr_end;
+ struct drr_object *drro = &thedrr.drr_u.drr_object;
+ struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
+ struct drr_write *drrw = &thedrr.drr_u.drr_write;
+ struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
+ struct drr_free *drrf = &thedrr.drr_u.drr_free;
+ struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+ char c;
+ boolean_t verbose = B_FALSE;
+ boolean_t first = B_TRUE;
+ int err;
+ zio_cksum_t zc = { 0 };
+ zio_cksum_t pcksum = { 0 };
+
+ while ((c = getopt(argc, argv, ":vC")) != -1) {
+ switch (c) {
+ case 'C':
+ do_cksum = B_FALSE;
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr,
+ "missing argument for '%c' option\n", optopt);
+ usage();
+ break;
+ case '?':
+ (void) fprintf(stderr, "invalid option '%c'\n",
+ optopt);
+ usage();
+ }
+ }
+
+ if (isatty(STDIN_FILENO)) {
+ (void) fprintf(stderr,
+ "Error: Backup stream can not be read "
+ "from a terminal.\n"
+ "You must redirect standard input.\n");
+ exit(1);
+ }
+
+ send_stream = stdin;
+ pcksum = zc;
+ while (ssread(drr, sizeof (dmu_replay_record_t), &zc)) {
+
+ if (first) {
+ if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ do_byteswap = B_TRUE;
+ if (do_cksum) {
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ /*
+ * recalculate header checksum now
+ * that we know it needs to be
+ * byteswapped.
+ */
+ fletcher_4_incremental_byteswap(drr,
+ sizeof (dmu_replay_record_t), &zc);
+ }
+ } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) {
+ (void) fprintf(stderr, "Invalid stream "
+ "(bad magic number)\n");
+ exit(1);
+ }
+ first = B_FALSE;
+ }
+ if (do_byteswap) {
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen =
+ BSWAP_32(drr->drr_payloadlen);
+ }
+
+ /*
+ * At this point, the leading fields of the replay record
+ * (drr_type and drr_payloadlen) have been byte-swapped if
+ * necessary, but the rest of the data structure (the
+ * union of type-specific structures) is still in its
+ * original state.
+ */
+ if (drr->drr_type >= DRR_NUMTYPES) {
+ (void) printf("INVALID record found: type 0x%x\n",
+ drr->drr_type);
+ (void) printf("Aborting.\n");
+ exit(1);
+ }
+
+ drr_record_count[drr->drr_type]++;
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ if (do_byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_versioninfo =
+ BSWAP_64(drrb->drr_versioninfo);
+ drrb->drr_creation_time =
+ BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_flags = BSWAP_32(drrb->drr_flags);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid =
+ BSWAP_64(drrb->drr_fromguid);
+ }
+
+ (void) printf("BEGIN record\n");
+ (void) printf("\thdrtype = %lld\n",
+ DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo));
+ (void) printf("\tfeatures = %llx\n",
+ DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo));
+ (void) printf("\tmagic = %llx\n",
+ (u_longlong_t)drrb->drr_magic);
+ (void) printf("\tcreation_time = %llx\n",
+ (u_longlong_t)drrb->drr_creation_time);
+ (void) printf("\ttype = %u\n", drrb->drr_type);
+ (void) printf("\tflags = 0x%x\n", drrb->drr_flags);
+ (void) printf("\ttoguid = %llx\n",
+ (u_longlong_t)drrb->drr_toguid);
+ (void) printf("\tfromguid = %llx\n",
+ (u_longlong_t)drrb->drr_fromguid);
+ (void) printf("\ttoname = %s\n", drrb->drr_toname);
+ if (verbose)
+ (void) printf("\n");
+
+ if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) {
+ nvlist_t *nv;
+ int sz = drr->drr_payloadlen;
+
+ if (sz > 1<<20) {
+ free(buf);
+ buf = malloc(sz);
+ }
+ (void) ssread(buf, sz, &zc);
+ if (ferror(send_stream))
+ perror("fread");
+ err = nvlist_unpack(buf, sz, &nv, 0);
+ if (err)
+ perror(strerror(err));
+ nvlist_print(stdout, nv);
+ nvlist_free(nv);
+ }
+ break;
+
+ case DRR_END:
+ if (do_byteswap) {
+ drre->drr_checksum.zc_word[0] =
+ BSWAP_64(drre->drr_checksum.zc_word[0]);
+ drre->drr_checksum.zc_word[1] =
+ BSWAP_64(drre->drr_checksum.zc_word[1]);
+ drre->drr_checksum.zc_word[2] =
+ BSWAP_64(drre->drr_checksum.zc_word[2]);
+ drre->drr_checksum.zc_word[3] =
+ BSWAP_64(drre->drr_checksum.zc_word[3]);
+ }
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum,
+ pcksum)) {
+ (void) printf("Expected checksum differs from "
+ "checksum in stream.\n");
+ (void) printf("Expected checksum = "
+ "%llx/%llx/%llx/%llx\n",
+ pcksum.zc_word[0],
+ pcksum.zc_word[1],
+ pcksum.zc_word[2],
+ pcksum.zc_word[3]);
+ }
+ (void) printf("END checksum = %llx/%llx/%llx/%llx\n",
+ drre->drr_checksum.zc_word[0],
+ drre->drr_checksum.zc_word[1],
+ drre->drr_checksum.zc_word[2],
+ drre->drr_checksum.zc_word[3]);
+
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ break;
+
+ case DRR_OBJECT:
+ if (do_byteswap) {
+ drro->drr_object = BSWAP_64(drro->drr_object);
+ drro->drr_type = BSWAP_32(drro->drr_type);
+ drro->drr_bonustype =
+ BSWAP_32(drro->drr_bonustype);
+ drro->drr_blksz = BSWAP_32(drro->drr_blksz);
+ drro->drr_bonuslen =
+ BSWAP_32(drro->drr_bonuslen);
+ drro->drr_toguid = BSWAP_64(drro->drr_toguid);
+ }
+ if (verbose) {
+ (void) printf("OBJECT object = %llu type = %u "
+ "bonustype = %u blksz = %u bonuslen = %u\n",
+ (u_longlong_t)drro->drr_object,
+ drro->drr_type,
+ drro->drr_bonustype,
+ drro->drr_blksz,
+ drro->drr_bonuslen);
+ }
+ if (drro->drr_bonuslen > 0) {
+ (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen,
+ 8), &zc);
+ }
+ break;
+
+ case DRR_FREEOBJECTS:
+ if (do_byteswap) {
+ drrfo->drr_firstobj =
+ BSWAP_64(drrfo->drr_firstobj);
+ drrfo->drr_numobjs =
+ BSWAP_64(drrfo->drr_numobjs);
+ drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid);
+ }
+ if (verbose) {
+ (void) printf("FREEOBJECTS firstobj = %llu "
+ "numobjs = %llu\n",
+ (u_longlong_t)drrfo->drr_firstobj,
+ (u_longlong_t)drrfo->drr_numobjs);
+ }
+ break;
+
+ case DRR_WRITE:
+ if (do_byteswap) {
+ drrw->drr_object = BSWAP_64(drrw->drr_object);
+ drrw->drr_type = BSWAP_32(drrw->drr_type);
+ drrw->drr_offset = BSWAP_64(drrw->drr_offset);
+ drrw->drr_length = BSWAP_64(drrw->drr_length);
+ drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
+ drrw->drr_key.ddk_prop =
+ BSWAP_64(drrw->drr_key.ddk_prop);
+ }
+ if (verbose) {
+ (void) printf("WRITE object = %llu type = %u "
+ "checksum type = %u\n"
+ "offset = %llu length = %llu "
+ "props = %llx\n",
+ (u_longlong_t)drrw->drr_object,
+ drrw->drr_type,
+ drrw->drr_checksumtype,
+ (u_longlong_t)drrw->drr_offset,
+ (u_longlong_t)drrw->drr_length,
+ (u_longlong_t)drrw->drr_key.ddk_prop);
+ }
+ (void) ssread(buf, drrw->drr_length, &zc);
+ total_write_size += drrw->drr_length;
+ break;
+
+ case DRR_WRITE_BYREF:
+ if (do_byteswap) {
+ drrwbr->drr_object =
+ BSWAP_64(drrwbr->drr_object);
+ drrwbr->drr_offset =
+ BSWAP_64(drrwbr->drr_offset);
+ drrwbr->drr_length =
+ BSWAP_64(drrwbr->drr_length);
+ drrwbr->drr_toguid =
+ BSWAP_64(drrwbr->drr_toguid);
+ drrwbr->drr_refguid =
+ BSWAP_64(drrwbr->drr_refguid);
+ drrwbr->drr_refobject =
+ BSWAP_64(drrwbr->drr_refobject);
+ drrwbr->drr_refoffset =
+ BSWAP_64(drrwbr->drr_refoffset);
+ drrwbr->drr_key.ddk_prop =
+ BSWAP_64(drrwbr->drr_key.ddk_prop);
+ }
+ if (verbose) {
+ (void) printf("WRITE_BYREF object = %llu "
+ "checksum type = %u props = %llx\n"
+ "offset = %llu length = %llu\n"
+ "toguid = %llx refguid = %llx\n"
+ "refobject = %llu refoffset = %llu\n",
+ (u_longlong_t)drrwbr->drr_object,
+ drrwbr->drr_checksumtype,
+ (u_longlong_t)drrwbr->drr_key.ddk_prop,
+ (u_longlong_t)drrwbr->drr_offset,
+ (u_longlong_t)drrwbr->drr_length,
+ (u_longlong_t)drrwbr->drr_toguid,
+ (u_longlong_t)drrwbr->drr_refguid,
+ (u_longlong_t)drrwbr->drr_refobject,
+ (u_longlong_t)drrwbr->drr_refoffset);
+ }
+ break;
+
+ case DRR_FREE:
+ if (do_byteswap) {
+ drrf->drr_object = BSWAP_64(drrf->drr_object);
+ drrf->drr_offset = BSWAP_64(drrf->drr_offset);
+ drrf->drr_length = BSWAP_64(drrf->drr_length);
+ }
+ if (verbose) {
+ (void) printf("FREE object = %llu "
+ "offset = %llu length = %lld\n",
+ (u_longlong_t)drrf->drr_object,
+ (u_longlong_t)drrf->drr_offset,
+ (longlong_t)drrf->drr_length);
+ }
+ break;
+ case DRR_SPILL:
+ if (do_byteswap) {
+ drrs->drr_object = BSWAP_64(drrs->drr_object);
+ drrs->drr_length = BSWAP_64(drrs->drr_length);
+ }
+ if (verbose) {
+				(void) printf("SPILL block for object = %llu "
+				    "length = %llu\n",
+				    (u_longlong_t)drrs->drr_object,
+				    (u_longlong_t)drrs->drr_length);
+ }
+ (void) ssread(buf, drrs->drr_length, &zc);
+ break;
+ }
+ pcksum = zc;
+ }
+ free(buf);
+
+ /* Print final summary */
+
+ (void) printf("SUMMARY:\n");
+ (void) printf("\tTotal DRR_BEGIN records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_BEGIN]);
+ (void) printf("\tTotal DRR_END records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_END]);
+ (void) printf("\tTotal DRR_OBJECT records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_OBJECT]);
+ (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
+	(void) printf("\tTotal DRR_WRITE records = %lld\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE]);
+	(void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
+	(void) printf("\tTotal DRR_FREE records = %lld\n",
+	    (u_longlong_t)drr_record_count[DRR_FREE]);
+	(void) printf("\tTotal DRR_SPILL records = %lld\n",
+	    (u_longlong_t)drr_record_count[DRR_SPILL]);
+	(void) printf("\tTotal records = %lld\n",
+	    (u_longlong_t)(drr_record_count[DRR_BEGIN] +
+	    drr_record_count[DRR_OBJECT] +
+	    drr_record_count[DRR_FREEOBJECTS] +
+	    drr_record_count[DRR_WRITE] +
+	    drr_record_count[DRR_WRITE_BYREF] +
+	    drr_record_count[DRR_FREE] +
+	    drr_record_count[DRR_SPILL] +
+	    drr_record_count[DRR_END]));
+ (void) printf("\tTotal write size = %lld (0x%llx)\n",
+ (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
+ (void) printf("\tTotal stream length = %lld (0x%llx)\n",
+ (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
+ return (0);
+}
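
The dump loop above is, at bottom, one incremental fletcher-4 pass over the whole stream: every ssread() folds the bytes it consumes into zc, and pcksum snapshots the running value after each record so a DRR_END can be checked against the checksum of everything before it. Below is a minimal standalone sketch of that pattern, using the plain fletcher-4 recurrence rather than ZFS's optimized fletcher_4_incremental_native(); the fixed record size and the I/O handling are simplifying assumptions.

#include <stdint.h>
#include <stdio.h>

typedef struct cksum { uint64_t zc_word[4]; } cksum_t;

/*
 * Fold size bytes into a running fletcher-4 checksum: four 64-bit
 * accumulators over 32-bit words, so per-record calls are equivalent
 * to one pass over the concatenated stream.
 */
static void
fletcher4_incremental(const void *buf, size_t size, cksum_t *zc)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zc->zc_word[0], b = zc->zc_word[1];
	uint64_t c = zc->zc_word[2], d = zc->zc_word[3];

	for (; ip < ipend; ip++) {
		a += *ip;
		b += a;
		c += b;
		d += c;
	}
	zc->zc_word[0] = a; zc->zc_word[1] = b;
	zc->zc_word[2] = c; zc->zc_word[3] = d;
}

static void
print_cksum(const char *tag, const cksum_t *zc)
{
	(void) printf("%s = %llx/%llx/%llx/%llx\n", tag,
	    (unsigned long long)zc->zc_word[0],
	    (unsigned long long)zc->zc_word[1],
	    (unsigned long long)zc->zc_word[2],
	    (unsigned long long)zc->zc_word[3]);
}

int
main(void)
{
	cksum_t zc = { { 0, 0, 0, 0 } };
	cksum_t pcksum = zc;
	char rec[312];		/* hypothetical fixed record size */

	while (fread(rec, sizeof (rec), 1, stdin) == 1) {
		pcksum = zc;	/* sum of everything before this record */
		fletcher4_incremental(rec, sizeof (rec), &zc);
	}
	print_cksum("stream", &zc);
	print_cksum("before last record", &pcksum);
	return (0);
}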
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index 3894f6baa543..d3502309eab5 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -86,14 +85,16 @@
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/zil.h>
+#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
@@ -106,6 +107,7 @@
#include <math.h>
#include <errno.h>
#include <sys/fs/zfs.h>
+#include <libnvpair.h>
static char cmdname[] = "ztest";
static char *zopt_pool = cmdname;
@@ -126,144 +128,231 @@ static int zopt_verbose = 0;
static int zopt_init = 1;
static char *zopt_dir = "/tmp";
static uint64_t zopt_time = 300; /* 5 minutes */
-static int zopt_maxfaults;
+static uint64_t zopt_maxloops = 50; /* max loops during spa_freeze() */
+
+#define BT_MAGIC 0x123456789abcdefULL
+#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1)
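
MAXFAULTS() is the number of leaf-vdev failures this configuration is guaranteed to survive: a top-level vdev is lost only when all MAX(zs_mirrors, 1) mirror legs are gone, and each raidz leg tolerates zopt_raidz_parity losses, so the cheapest fatal combination costs mirrors * (parity + 1) faults. For example, with 3 mirror copies of raidz2 legs, MAXFAULTS() = 3 * (2 + 1) - 1 = 8: an eighth failure is still survivable, while a ninth could finish off one top-level vdev.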
+
+enum ztest_io_type {
+ ZTEST_IO_WRITE_TAG,
+ ZTEST_IO_WRITE_PATTERN,
+ ZTEST_IO_WRITE_ZEROES,
+ ZTEST_IO_TRUNCATE,
+ ZTEST_IO_SETATTR,
+ ZTEST_IO_TYPES
+};
typedef struct ztest_block_tag {
+ uint64_t bt_magic;
uint64_t bt_objset;
uint64_t bt_object;
uint64_t bt_offset;
+ uint64_t bt_gen;
uint64_t bt_txg;
- uint64_t bt_thread;
- uint64_t bt_seq;
+ uint64_t bt_crtxg;
} ztest_block_tag_t;
-typedef struct ztest_args {
- char za_pool[MAXNAMELEN];
- spa_t *za_spa;
- objset_t *za_os;
- zilog_t *za_zilog;
- thread_t za_thread;
- uint64_t za_instance;
- uint64_t za_random;
- uint64_t za_diroff;
- uint64_t za_diroff_shared;
- uint64_t za_zil_seq;
- hrtime_t za_start;
- hrtime_t za_stop;
- hrtime_t za_kill;
- /*
- * Thread-local variables can go here to aid debugging.
- */
- ztest_block_tag_t za_rbt;
- ztest_block_tag_t za_wbt;
- dmu_object_info_t za_doi;
- dmu_buf_t *za_dbuf;
-} ztest_args_t;
-
-typedef void ztest_func_t(ztest_args_t *);
+typedef struct bufwad {
+ uint64_t bw_index;
+ uint64_t bw_txg;
+ uint64_t bw_data;
+} bufwad_t;
+
+/*
+ * XXX -- fix zfs range locks to be generic so we can use them here.
+ */
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+ void *rll_writer;
+ int rll_readers;
+ mutex_t rll_lock;
+ cond_t rll_cv;
+} rll_t;
+
+typedef struct rl {
+ uint64_t rl_object;
+ uint64_t rl_offset;
+ uint64_t rl_size;
+ rll_t *rl_lock;
+} rl_t;
+
+#define ZTEST_RANGE_LOCKS 64
+#define ZTEST_OBJECT_LOCKS 64
+
+/*
+ * Object descriptor. Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+ uint64_t od_dir;
+ uint64_t od_object;
+ dmu_object_type_t od_type;
+ dmu_object_type_t od_crtype;
+ uint64_t od_blocksize;
+ uint64_t od_crblocksize;
+ uint64_t od_gen;
+ uint64_t od_crgen;
+ char od_name[MAXNAMELEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+ objset_t *zd_os;
+ zilog_t *zd_zilog;
+ uint64_t zd_seq;
+ ztest_od_t *zd_od; /* debugging aid */
+ char zd_name[MAXNAMELEN];
+ mutex_t zd_dirobj_lock;
+ rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
+ rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+ ztest_func_t *zi_func; /* test function */
+ uint64_t zi_iters; /* iterations per execution */
+ uint64_t *zi_interval; /* execute every <interval> seconds */
+ uint64_t zi_call_count; /* per-pass count */
+ uint64_t zi_call_time; /* per-pass time */
+ uint64_t zi_call_next; /* next time to call this function */
+} ztest_info_t;
/*
* Note: these aren't static because we want dladdr() to work.
*/
ztest_func_t ztest_dmu_read_write;
-ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
-ztest_func_t ztest_fzap;
ztest_func_t ztest_zap_parallel;
-ztest_func_t ztest_traverse;
-ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
+ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
-ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_ddt_repair;
+ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_spa_rename;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;
-ztest_func_t ztest_scrub;
+ztest_func_t ztest_split_pool;
-typedef struct ztest_info {
- ztest_func_t *zi_func; /* test function */
- uint64_t zi_iters; /* iterations per execution */
- uint64_t *zi_interval; /* execute every <interval> seconds */
- uint64_t zi_calls; /* per-pass count */
- uint64_t zi_call_time; /* per-pass time */
- uint64_t zi_call_total; /* cumulative total */
- uint64_t zi_call_target; /* target cumulative total */
-} ztest_info_t;
-
-uint64_t zopt_always = 0; /* all the time */
-uint64_t zopt_often = 1; /* every second */
-uint64_t zopt_sometimes = 10; /* every 10 seconds */
-uint64_t zopt_rarely = 60; /* every 60 seconds */
+uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
ztest_info_t ztest_info[] = {
{ ztest_dmu_read_write, 1, &zopt_always },
- { ztest_dmu_read_write_zcopy, 1, &zopt_always },
- { ztest_dmu_write_parallel, 30, &zopt_always },
+ { ztest_dmu_write_parallel, 10, &zopt_always },
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
+ { ztest_dmu_commit_callbacks, 1, &zopt_always },
{ ztest_zap, 30, &zopt_always },
- { ztest_fzap, 30, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
- { ztest_dsl_prop_get_set, 1, &zopt_sometimes },
- { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes },
- { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
- { ztest_spa_create_destroy, 1, &zopt_sometimes },
+ { ztest_split_pool, 1, &zopt_always },
+ { ztest_zil_commit, 1, &zopt_incessant },
+ { ztest_dmu_read_write_zcopy, 1, &zopt_often },
+ { ztest_dmu_objset_create_destroy, 1, &zopt_often },
+ { ztest_dsl_prop_get_set, 1, &zopt_often },
+ { ztest_spa_prop_get_set, 1, &zopt_sometimes },
+#if 0
+ { ztest_dmu_prealloc, 1, &zopt_sometimes },
+#endif
+ { ztest_fzap, 1, &zopt_sometimes },
+ { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
+ { ztest_spa_create_destroy, 1, &zopt_sometimes },
{ ztest_fault_inject, 1, &zopt_sometimes },
+ { ztest_ddt_repair, 1, &zopt_sometimes },
+ { ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_spa_rename, 1, &zopt_rarely },
- { ztest_vdev_attach_detach, 1, &zopt_rarely },
- { ztest_vdev_LUN_growth, 1, &zopt_rarely },
+ { ztest_scrub, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
- { ztest_vdev_add_remove, 1, &zopt_vdevtime },
+ { ztest_vdev_attach_detach, 1, &zopt_rarely },
+ { ztest_vdev_LUN_growth, 1, &zopt_rarely },
+ { ztest_vdev_add_remove, 1, &zopt_vdevtime },
{ ztest_vdev_aux_add_remove, 1, &zopt_vdevtime },
- { ztest_scrub, 1, &zopt_vdevtime },
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
-#define ZTEST_SYNC_LOCKS 16
+/*
+ * The following struct is used to hold a list of uncalled commit callbacks.
+ * The callbacks are ordered by txg number.
+ */
+typedef struct ztest_cb_list {
+ mutex_t zcl_callbacks_lock;
+ list_t zcl_callbacks;
+} ztest_cb_list_t;
/*
* Stuff we need to share writably between parent and child.
*/
typedef struct ztest_shared {
- mutex_t zs_vdev_lock;
- rwlock_t zs_name_lock;
- uint64_t zs_vdev_primaries;
- uint64_t zs_vdev_aux;
+ char *zs_pool;
+ spa_t *zs_spa;
+ hrtime_t zs_proc_start;
+ hrtime_t zs_proc_stop;
+ hrtime_t zs_thread_start;
+ hrtime_t zs_thread_stop;
+ hrtime_t zs_thread_kill;
uint64_t zs_enospc_count;
- hrtime_t zs_start_time;
- hrtime_t zs_stop_time;
+ uint64_t zs_vdev_next_leaf;
+ uint64_t zs_vdev_aux;
uint64_t zs_alloc;
uint64_t zs_space;
+ mutex_t zs_vdev_lock;
+ rwlock_t zs_name_lock;
ztest_info_t zs_info[ZTEST_FUNCS];
- mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS];
- uint64_t zs_seq[ZTEST_SYNC_LOCKS];
+ uint64_t zs_splits;
+ uint64_t zs_mirrors;
+ ztest_ds_t zs_zd[];
} ztest_shared_t;
+#define ID_PARALLEL -1ULL
+
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
-static ztest_shared_t *ztest_shared;
+ztest_shared_t *ztest_shared;
+uint64_t *ztest_seq;
static int ztest_random_fd;
static int ztest_dump_core = 1;
-static uint64_t metaslab_sz;
static boolean_t ztest_exiting;
+/* Global commit callback list */
+static ztest_cb_list_t zcl;
+
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
+static uint64_t metaslab_sz;
-#define ZTEST_DIROBJ 1
-#define ZTEST_MICROZAP_OBJ 2
-#define ZTEST_FATZAP_OBJ 3
-
-#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10)
-#define ZTEST_DIRSIZE 256
+enum ztest_object {
+ ZTEST_META_DNODE = 0,
+ ZTEST_DIROBJ,
+ ZTEST_OBJECTS
+};
static void usage(boolean_t) __NORETURN;
@@ -381,21 +470,22 @@ usage(boolean_t requested)
(void) fprintf(fp, "Usage: %s\n"
"\t[-v vdevs (default: %llu)]\n"
"\t[-s size_of_each_vdev (default: %s)]\n"
- "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
+ "\t[-a alignment_shift (default: %d)] use 0 for random\n"
"\t[-m mirror_copies (default: %d)]\n"
"\t[-r raidz_disks (default: %d)]\n"
"\t[-R raidz_parity (default: %d)]\n"
"\t[-d datasets (default: %d)]\n"
"\t[-t threads (default: %d)]\n"
"\t[-g gang_block_threshold (default: %s)]\n"
- "\t[-i initialize pool i times (default: %d)]\n"
- "\t[-k kill percentage (default: %llu%%)]\n"
+ "\t[-i init_count (default: %d)] initialize pool i times\n"
+ "\t[-k kill_percentage (default: %llu%%)]\n"
"\t[-p pool_name (default: %s)]\n"
- "\t[-f file directory for vdev files (default: %s)]\n"
- "\t[-V(erbose)] (use multiple times for ever more blather)\n"
- "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
- "\t[-T time] total run time (default: %llu sec)\n"
- "\t[-P passtime] time per pass (default: %llu sec)\n"
+ "\t[-f dir (default: %s)] file directory for vdev files\n"
+ "\t[-V] verbose (use multiple times for ever more blather)\n"
+ "\t[-E] use existing pool instead of creating new one\n"
+ "\t[-T time (default: %llu sec)] total run time\n"
+ "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+ "\t[-P passtime (default: %llu sec)] time per pass\n"
"\t[-h] (print help)\n"
"",
cmdname,
@@ -413,31 +503,11 @@ usage(boolean_t requested)
zopt_pool, /* -p */
zopt_dir, /* -f */
(u_longlong_t)zopt_time, /* -T */
+ (u_longlong_t)zopt_maxloops, /* -F */
(u_longlong_t)zopt_passtime); /* -P */
exit(requested ? 0 : 1);
}
-static uint64_t
-ztest_random(uint64_t range)
-{
- uint64_t r;
-
- if (range == 0)
- return (0);
-
- if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
- fatal(1, "short read from /dev/urandom");
-
- return (r % range);
-}
-
-/* ARGSUSED */
-static void
-ztest_record_enospc(char *s)
-{
- ztest_shared->zs_enospc_count++;
-}
-
static void
process_options(int argc, char **argv)
{
@@ -451,7 +521,7 @@ process_options(int argc, char **argv)
metaslab_gang_bang = 32 << 10;
while ((opt = getopt(argc, argv,
- "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
+ "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) {
value = 0;
switch (opt) {
case 'v':
@@ -467,6 +537,7 @@ process_options(int argc, char **argv)
case 'k':
case 'T':
case 'P':
+ case 'F':
value = nicenumtoull(optarg);
}
switch (opt) {
@@ -486,7 +557,7 @@ process_options(int argc, char **argv)
zopt_raidz = MAX(1, value);
break;
case 'R':
- zopt_raidz_parity = MIN(MAX(value, 1), 2);
+ zopt_raidz_parity = MIN(MAX(value, 1), 3);
break;
case 'd':
zopt_datasets = MAX(1, value);
@@ -521,6 +592,9 @@ process_options(int argc, char **argv)
case 'P':
zopt_passtime = MAX(1, value);
break;
+ case 'F':
+ zopt_maxloops = MAX(1, value);
+ break;
case 'h':
usage(B_TRUE);
break;
@@ -533,8 +607,37 @@ process_options(int argc, char **argv)
zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
- zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
- zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
+ zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
+ UINT64_MAX >> 2);
+}
+
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
+ (void) kill(getpid(), SIGKILL);
+}
+
+static uint64_t
+ztest_random(uint64_t range)
+{
+ uint64_t r;
+
+ if (range == 0)
+ return (0);
+
+ if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
+ fatal(1, "short read from /dev/urandom");
+
+ return (r % range);
+}
+
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+ ztest_shared->zs_enospc_count++;
}
static uint64_t
@@ -563,7 +666,7 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
(void) sprintf(path, ztest_aux_template,
zopt_dir, zopt_pool, aux, vdev);
} else {
- vdev = ztest_shared->zs_vdev_primaries++;
+ vdev = ztest_shared->zs_vdev_next_leaf++;
(void) sprintf(path, ztest_dev_template,
zopt_dir, zopt_pool, vdev);
}
@@ -674,100 +777,807 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
return (root);
}
+static int
+ztest_random_blocksize(void)
+{
+ return (1 << (SPA_MINBLOCKSHIFT +
+ ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
+}
+
+static int
+ztest_random_ibshift(void)
+{
+ return (DN_MIN_INDBLKSHIFT +
+ ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
+}
+
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
+{
+ uint64_t top;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ do {
+ top = ztest_random(rvd->vdev_children);
+ tvd = rvd->vdev_child[top];
+ } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+ tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
+
+ return (top);
+}
+
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+ uint64_t value;
+
+ do {
+ value = zfs_prop_random_value(prop, ztest_random(-1ULL));
+ } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
+
+ return (value);
+}
+
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+ boolean_t inherit)
+{
+ const char *propname = zfs_prop_to_name(prop);
+ const char *valname;
+ char setpoint[MAXPATHLEN];
+ uint64_t curval;
+ int error;
+
+ error = dsl_prop_set(osname, propname,
+ (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
+ sizeof (value), 1, &value);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT3U(error, ==, 0);
+
+ VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
+ 1, &curval, setpoint), ==, 0);
+
+ if (zopt_verbose >= 6) {
+ VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
+ (void) printf("%s %s = %s at '%s'\n",
+ osname, propname, valname, setpoint);
+ }
+
+ return (error);
+}
+
+static int
+ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
+{
+ spa_t *spa = zs->zs_spa;
+ nvlist_t *props = NULL;
+ int error;
+
+ VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT3U(error, ==, 0);
+
+ return (error);
+}
+
+static void
+ztest_rll_init(rll_t *rll)
+{
+ rll->rll_writer = NULL;
+ rll->rll_readers = 0;
+ VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+}
+
static void
-ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+ztest_rll_destroy(rll_t *rll)
+{
+ ASSERT(rll->rll_writer == NULL);
+ ASSERT(rll->rll_readers == 0);
+ VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+ VERIFY(cond_destroy(&rll->rll_cv) == 0);
+}
+
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+ VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+ if (type == RL_READER) {
+ while (rll->rll_writer != NULL)
+ (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_readers++;
+ } else {
+ while (rll->rll_writer != NULL || rll->rll_readers)
+ (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_writer = curthread;
+ }
+
+ VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
+
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+ VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+ if (rll->rll_writer) {
+ ASSERT(rll->rll_readers == 0);
+ rll->rll_writer = NULL;
+ } else {
+ ASSERT(rll->rll_readers != 0);
+ ASSERT(rll->rll_writer == NULL);
+ rll->rll_readers--;
+ }
+
+ if (rll->rll_writer == NULL && rll->rll_readers == 0)
+ VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+
+ VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
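
The rll_t above is a minimal reader/writer lock built from one mutex and one condition variable: readers block only while a writer holds the lock, writers block while anyone does, and whoever drops the last hold broadcasts. A hedged usage sketch against the functions just defined (the lock must first be set up with ztest_rll_init()); the stats names are hypothetical:

/* Sketch only: exercises the rll_t primitives defined above. */
static rll_t stats_lock;	/* ztest_rll_init(&stats_lock) at startup */
static uint64_t stats_value;

static uint64_t
stats_read(void)
{
	uint64_t v;

	ztest_rll_lock(&stats_lock, RL_READER);	/* shared with other readers */
	v = stats_value;
	ztest_rll_unlock(&stats_lock);
	return (v);
}

static void
stats_bump(void)
{
	ztest_rll_lock(&stats_lock, RL_WRITER);	/* exclusive */
	stats_value++;
	ztest_rll_unlock(&stats_lock);
}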
+
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+ ztest_rll_lock(rll, type);
+}
+
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+ ztest_rll_unlock(rll);
+}
+
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+ uint64_t size, rl_type_t type)
+{
+ uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+ rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+ rl_t *rl;
+
+ rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+ rl->rl_object = object;
+ rl->rl_offset = offset;
+ rl->rl_size = size;
+ rl->rl_lock = rll;
+
+ ztest_rll_lock(rll, type);
+
+ return (rl);
+}
+
+static void
+ztest_range_unlock(rl_t *rl)
+{
+ rll_t *rll = rl->rl_lock;
+
+ ztest_rll_unlock(rll);
+
+ umem_free(rl, sizeof (*rl));
+}
+
+static void
+ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+{
+ zd->zd_os = os;
+ zd->zd_zilog = dmu_objset_zil(os);
+ zd->zd_seq = 0;
+ dmu_objset_name(os, zd->zd_name);
+
+ VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
+
+ for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_init(&zd->zd_object_lock[l]);
+
+ for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_init(&zd->zd_range_lock[l]);
+}
+
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+ VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
+
+ for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_object_lock[l]);
+
+ for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_range_lock[l]);
+}
+
+#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
- int bs = SPA_MINBLOCKSHIFT +
- ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
- int ibs = DN_MIN_INDBLKSHIFT +
- ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
+ uint64_t txg;
int error;
- error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
+ /*
+ * Attempt to assign tx to some transaction group.
+ */
+ error = dmu_tx_assign(tx, txg_how);
if (error) {
- char osname[300];
- dmu_objset_name(os, osname);
- fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
- osname, object, 1 << bs, ibs, error);
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ } else {
+ ASSERT3U(error, ==, ENOSPC);
+ ztest_record_enospc(tag);
+ }
+ dmu_tx_abort(tx);
+ return (0);
}
+ txg = dmu_tx_get_txg(tx);
+ ASSERT(txg != 0);
+ return (txg);
+}
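
Every test body follows the same convention around ztest_tx_assign(): build the tx, declare its holds, and treat a zero return as "this pass is over" (the helper has already aborted the tx, after either waiting out an ERESTART or recording ENOSPC). A hedged caller sketch using the TXG_MIGHTWAIT macro above; the object and offset are assumed to already exist:

/* Hypothetical test body illustrating the ztest_tx_assign() convention. */
static void
example_update(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
	objset_t *os = zd->zd_os;
	uint64_t value = 0xdeadbeefULL;
	dmu_tx_t *tx = dmu_tx_create(os);
	uint64_t txg;

	dmu_tx_hold_write(tx, object, offset, sizeof (value));

	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
	if (txg == 0)
		return;		/* tx already aborted; ENOSPC recorded */

	dmu_write(os, object, offset, sizeof (value), &value, tx);
	dmu_tx_commit(tx);
}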
+
+static void
+ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+{
+ uint64_t *ip = buf;
+ uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+
+ while (ip < ip_end)
+ *ip++ = value;
}
-static uint8_t
-ztest_random_checksum(void)
+static boolean_t
+ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
{
- uint8_t checksum;
+ uint64_t *ip = buf;
+ uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+ uint64_t diff = 0;
- do {
- checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
- } while (zio_checksum_table[checksum].ci_zbt);
+ while (ip < ip_end)
+ diff |= (value - *ip++);
+
+ return (diff == 0);
+}
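
ztest_pattern_match() uses a branch-free comparison: instead of testing each word, it ORs all the differences together and tests once at the end, so the buffer matches exactly when the accumulated diff is zero. The same idiom in standalone form:

#include <stdint.h>
#include <stddef.h>

/* Returns nonzero iff every 64-bit word in p[0..nwords) equals v. */
static int
all_words_equal(const uint64_t *p, size_t nwords, uint64_t v)
{
	uint64_t diff = 0;

	while (nwords-- != 0)
		diff |= (v - *p++);	/* contributes 0 only when *p == v */
	return (diff == 0);
}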
+
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+ bt->bt_magic = BT_MAGIC;
+ bt->bt_objset = dmu_objset_id(os);
+ bt->bt_object = object;
+ bt->bt_offset = offset;
+ bt->bt_gen = gen;
+ bt->bt_txg = txg;
+ bt->bt_crtxg = crtxg;
+}
+
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+ ASSERT(bt->bt_magic == BT_MAGIC);
+ ASSERT(bt->bt_objset == dmu_objset_id(os));
+ ASSERT(bt->bt_object == object);
+ ASSERT(bt->bt_offset == offset);
+ ASSERT(bt->bt_gen <= gen);
+ ASSERT(bt->bt_txg <= txg);
+ ASSERT(bt->bt_crtxg == crtxg);
+}
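
Together these two functions give every write a self-describing fingerprint: generate() stamps exact identity (objset, object, offset) plus three timestamps, and verify() demands the identity match exactly while gen and txg are merely bounded above, since a block may legitimately be older than the caller's current view. A simplified standalone round trip, with plain asserts standing in for the objset plumbing:

#include <assert.h>
#include <stdint.h>

#define	TAG_MAGIC	0x123456789abcdefULL

typedef struct tag {
	uint64_t t_magic, t_object, t_offset, t_gen, t_txg;
} tag_t;

static void
tag_generate(tag_t *t, uint64_t obj, uint64_t off, uint64_t gen, uint64_t txg)
{
	t->t_magic = TAG_MAGIC;
	t->t_object = obj;
	t->t_offset = off;
	t->t_gen = gen;
	t->t_txg = txg;
}

static void
tag_verify(const tag_t *t, uint64_t obj, uint64_t off,
    uint64_t gen, uint64_t txg)
{
	assert(t->t_magic == TAG_MAGIC);
	assert(t->t_object == obj);	/* identity is exact */
	assert(t->t_offset == off);
	assert(t->t_gen <= gen);	/* history only moves forward */
	assert(t->t_txg <= txg);
}

int
main(void)
{
	tag_t t;

	tag_generate(&t, 5, 8192, 1, 100);
	tag_verify(&t, 5, 8192, 1, 100);	/* written just now: ok */
	tag_verify(&t, 5, 8192, 7, 350);	/* re-read much later: still ok */
	return (0);
}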
+
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+ dmu_object_info_t doi;
+ ztest_block_tag_t *bt;
+
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+ bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+ return (bt);
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define lrz_type lr_mode
+#define lrz_blocksize lr_uid
+#define lrz_ibshift lr_gid
+#define lrz_bonustype lr_rdev
+#define lrz_bonuslen lr_crtime[1]
+
+static void
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
+
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
- if (checksum == ZIO_CHECKSUM_OFF)
- checksum = ZIO_CHECKSUM_ON;
+ itx->itx_oid = object;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+ itx_t *itx;
+ itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ if (lr->lr_length > ZIL_MAX_LOG_DATA)
+ write_state = WR_INDIRECT;
+
+ itx = zil_itx_create(TX_WRITE,
+ sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+ if (write_state == WR_COPIED &&
+ dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ write_state = WR_NEED_COPY;
+ }
+ itx->itx_private = zd;
+ itx->itx_wr_state = write_state;
+ itx->itx_sync = (ztest_random(8) == 0);
+ itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
- return (checksum);
+static void
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ itx->itx_sync = B_FALSE;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
}
-static uint8_t
-ztest_random_compress(void)
+static void
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
- return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ itx->itx_sync = B_FALSE;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
}
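
All of the logging helpers copy with the same expression, bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, size - sizeof (lr_t)): since &x + 1 points just past x, both source and destination skip their embedded lr_t header and only the type-specific body (plus any trailing name) is moved. A standalone illustration of the pointer arithmetic, with hypothetical stand-in types:

#include <stdio.h>
#include <string.h>

typedef struct hdr { long h_seq, h_len; } hdr_t;	/* stand-in for lr_t */
typedef struct rec {
	hdr_t r_common;		/* header to skip */
	long r_body[3];		/* type-specific payload */
} rec_t;

int
main(void)
{
	rec_t src = { { 1, 2 }, { 10, 20, 30 } };
	rec_t dst = { { 99, 99 }, { 0, 0, 0 } };

	/* Copy only the body: start just past each embedded header. */
	memcpy(&dst.r_common + 1, &src.r_common + 1,
	    sizeof (rec_t) - sizeof (hdr_t));

	(void) printf("body %ld %ld %ld, header untouched (%ld)\n",
	    dst.r_body[0], dst.r_body[1], dst.r_body[2], dst.r_common.h_seq);
	return (0);
}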
+/*
+ * ZIL replay ops
+ */
static int
-ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
+ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ ztest_block_tag_t *bbt;
+ dmu_buf_t *db;
dmu_tx_t *tx;
- int error;
+ uint64_t txg;
+ int error = 0;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ }
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0)
+ return (ENOSPC);
+
+ ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = zap_create(os,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = zap_create_claim(os, lr->lr_foid,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ } else {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = dmu_object_alloc(os,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = dmu_object_claim(os, lr->lr_foid,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ }
+
if (error) {
- dmu_tx_abort(tx);
+ ASSERT3U(error, ==, EEXIST);
+ ASSERT(zd->zd_zilog->zl_replay);
+ dmu_tx_commit(tx);
return (error);
}
- error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
- DMU_OT_NONE, 0, tx);
- ASSERT3U(error, ==, 0);
+ ASSERT(lr->lr_foid != 0);
+
+ if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+ VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+ lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ bbt = ztest_bt_bonus(db);
+ dmu_buf_will_dirty(db, tx);
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+ &lr->lr_foid, tx));
+
+ (void) ztest_log_create(zd, tx, lr);
+
dmu_tx_commit(tx);
- if (zopt_verbose >= 5) {
- char osname[MAXNAMELEN];
- dmu_objset_name(os, osname);
- (void) printf("replay create of %s object %llu"
- " in txg %llu = %d\n",
- osname, (u_longlong_t)lr->lr_doid,
- (u_longlong_t)dmu_tx_get_txg(tx), error);
+ return (0);
+}
+
+static int
+ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object, txg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ VERIFY3U(0, ==,
+ zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+ ASSERT(object != 0);
+
+ ztest_object_lock(zd, object, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_object_unlock(zd, object);
+ return (ENOSPC);
}
- return (error);
+ if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+ VERIFY3U(0, ==, zap_destroy(os, object, tx));
+ } else {
+ VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+ }
+
+ VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+ (void) ztest_log_remove(zd, tx, lr, object);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, object);
+
+ return (0);
}
static int
-ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
{
+ objset_t *os = zd->zd_os;
+ void *data = lr + 1; /* data follows lr */
+ uint64_t offset, length;
+ ztest_block_tag_t *bt = data;
+ ztest_block_tag_t *bbt;
+ uint64_t gen, txg, lrtxg, crtxg;
+ dmu_object_info_t doi;
dmu_tx_t *tx;
- int error;
+ dmu_buf_t *db;
+ arc_buf_t *abuf = NULL;
+ rl_t *rl;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+ byteswap_uint64_array(bt, sizeof (*bt));
+
+ if (bt->bt_magic != BT_MAGIC)
+ bt = NULL;
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ dmu_object_info_from_db(db, &doi);
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ gen = bbt->bt_gen;
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
+
+ dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+ if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+ P2PHASE(offset, length) == 0)
+ abuf = dmu_request_arcbuf(db, length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ dmu_buf_rele(db, FTAG);
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ if (bt != NULL) {
+ /*
+ * Usually, verify the old data before writing new data --
+ * but not always, because we also want to verify correct
+ * behavior when the data was not recently read into cache.
+ */
+ ASSERT(offset % doi.doi_data_block_size == 0);
+ if (ztest_random(4) != 0) {
+ int prefetch = ztest_random(2) ?
+ DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+ ztest_block_tag_t rbt;
+
+ VERIFY(dmu_read(os, lr->lr_foid, offset,
+ sizeof (rbt), &rbt, prefetch) == 0);
+ if (rbt.bt_magic == BT_MAGIC) {
+ ztest_bt_verify(&rbt, os, lr->lr_foid,
+ offset, gen, txg, crtxg);
+ }
+ }
+
+ /*
+ * Writes can appear to be newer than the bonus buffer because
+ * the ztest_get_data() callback does a dmu_read() of the
+		 * open-context data, which may be different from the data
+ * as it was when the write was generated.
+ */
+ if (zd->zd_zilog->zl_replay) {
+ ztest_bt_verify(bt, os, lr->lr_foid, offset,
+ MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+ bt->bt_crtxg);
+ }
+
+ /*
+ * Set the bt's gen/txg to the bonus buffer's gen/txg
+ * so that all of the usual ASSERTs will work.
+ */
+ ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
}
- error = dmu_object_free(os, lr->lr_doid, tx);
+ if (abuf == NULL) {
+ dmu_write(os, lr->lr_foid, offset, length, data, tx);
+ } else {
+ bcopy(data, abuf->b_data, length);
+ dmu_assign_arcbuf(db, offset, abuf, tx);
+ }
+
+ (void) ztest_log_write(zd, tx, lr);
+
+ dmu_buf_rele(db, FTAG);
+
dmu_tx_commit(tx);
- return (error);
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+ lr->lr_length, tx) == 0);
+
+ (void) ztest_log_truncate(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ uint64_t txg, lrtxg, crtxg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ if (zd->zd_zilog->zl_replay) {
+ ASSERT(lr->lr_size != 0);
+ ASSERT(lr->lr_mode != 0);
+ ASSERT(lrtxg != 0);
+ } else {
+ /*
+ * Randomly change the size and increment the generation.
+ */
+ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+ sizeof (*bbt);
+ lr->lr_mode = bbt->bt_gen + 1;
+ ASSERT(lrtxg == 0);
+ }
+
+ /*
+ * Verify that the current bonus buffer is not newer than our txg.
+ */
+ ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+ MAX(txg, lrtxg), crtxg);
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+ ASSERT3U(lr->lr_size, <=, db->db_size);
+ VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+ bbt = ztest_bt_bonus(db);
+
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+ dmu_buf_rele(db, FTAG);
+
+ (void) ztest_log_setattr(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
}
zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
@@ -780,9 +1590,9 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
NULL, /* TX_RMDIR */
NULL, /* TX_LINK */
NULL, /* TX_RENAME */
- NULL, /* TX_WRITE */
- NULL, /* TX_TRUNCATE */
- NULL, /* TX_SETATTR */
+ ztest_replay_write, /* TX_WRITE */
+ ztest_replay_truncate, /* TX_TRUNCATE */
+ ztest_replay_setattr, /* TX_SETATTR */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
@@ -794,13 +1604,477 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
};
/*
+ * ZIL get_data callbacks
+ */
+
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+ ztest_ds_t *zd = zgd->zgd_private;
+ uint64_t object = zgd->zgd_rl->rl_object;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ ztest_range_unlock(zgd->zgd_rl);
+ ztest_object_unlock(zd, object);
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+ umem_free(zgd, sizeof (*zgd));
+}
+
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ ztest_ds_t *zd = arg;
+ objset_t *os = zd->zd_os;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
+ uint64_t txg = lr->lr_common.lrc_txg;
+ uint64_t crtxg;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ztest_object_lock(zd, object, RL_READER);
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error) {
+ ztest_object_unlock(zd, object);
+ return (error);
+ }
+
+ crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+ if (crtxg == 0 || crtxg > txg) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, object);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ dmu_buf_rele(db, FTAG);
+ db = NULL;
+
+ zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+ zgd->zgd_zilog = zd->zd_zilog;
+ zgd->zgd_private = zd;
+
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ ASSERT(error == 0);
+ } else {
+ size = doi.doi_data_block_size;
+ if (ISP2(size)) {
+ offset = P2ALIGN(offset, size);
+ } else {
+ ASSERT(offset < size);
+ offset = 0;
+ }
+
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ ztest_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ ztest_get_done(zgd, error);
+
+ return (error);
+}
+
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+ char *lr;
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+ if (name)
+ bcopy(name, lr + lrsize, namesize);
+
+ return (lr);
+}
+
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ umem_free(lr, lrsize + namesize);
+}
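
ztest_lr_alloc() builds the variable-length records the replay functions expect: the fixed struct is immediately followed by its NUL-terminated name, which is why ztest_replay_create() and friends recover it as (void *)(lr + 1). A standalone sketch of that layout, with a hypothetical fixed part:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct fixed { long f_doid, f_foid; } fixed_t;	/* stand-in */

int
main(void)
{
	const char *name = "obj(3)[0]";
	size_t namesize = strlen(name) + 1;
	fixed_t *lr = calloc(1, sizeof (*lr) + namesize);

	if (lr == NULL)
		return (1);
	memcpy(lr + 1, name, namesize);	/* name follows the struct */

	(void) printf("name = %s\n", (const char *)(lr + 1));
	free(lr);
	return (0);
}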
+
+/*
+ * Look up a bunch of objects.  Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ for (int i = 0; i < count; i++, od++) {
+ od->od_object = 0;
+ error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+ sizeof (uint64_t), 1, &od->od_object);
+ if (error) {
+ ASSERT(error == ENOENT);
+ ASSERT(od->od_object == 0);
+ missing++;
+ } else {
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ dmu_object_info_t doi;
+
+ ASSERT(od->od_object != 0);
+ ASSERT(missing == 0); /* there should be no gaps */
+
+ ztest_object_lock(zd, od->od_object, RL_READER);
+ VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+ od->od_object, FTAG, &db));
+ dmu_object_info_from_db(db, &doi);
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ od->od_type = doi.doi_type;
+ od->od_blocksize = doi.doi_data_block_size;
+ od->od_gen = bbt->bt_gen;
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, od->od_object);
+ }
+ }
+
+ return (missing);
+}
+
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ for (int i = 0; i < count; i++, od++) {
+ if (missing) {
+ od->od_object = 0;
+ missing++;
+ continue;
+ }
+
+ lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+ lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
+ lr->lrz_type = od->od_crtype;
+ lr->lrz_blocksize = od->od_crblocksize;
+ lr->lrz_ibshift = ztest_random_ibshift();
+ lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+ lr->lrz_bonuslen = dmu_bonus_max();
+ lr->lr_gen = od->od_crgen;
+ lr->lr_crtime[0] = time(NULL);
+
+ if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+ ASSERT(missing == 0);
+ od->od_object = 0;
+ missing++;
+ } else {
+ od->od_object = lr->lr_foid;
+ od->od_type = od->od_crtype;
+ od->od_blocksize = od->od_crblocksize;
+ od->od_gen = od->od_crgen;
+ ASSERT(od->od_object != 0);
+ }
+
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ od += count - 1;
+
+ for (int i = count - 1; i >= 0; i--, od--) {
+ if (missing) {
+ missing++;
+ continue;
+ }
+
+ if (od->od_object == 0)
+ continue;
+
+ lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+
+ if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+ ASSERT3U(error, ==, ENOSPC);
+ missing++;
+ } else {
+ od->od_object = 0;
+ }
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+ void *data)
+{
+ lr_write_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ bcopy(data, lr + 1, size);
+
+ error = ztest_replay_write(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+ return (error);
+}
+
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+ lr_truncate_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+
+ error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+}
+
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+ lr_setattr_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_size = 0;
+ lr->lr_mode = 0;
+
+ error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+}
+
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+
+ ztest_object_lock(zd, object, RL_READER);
+ rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, object, offset, size);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+ if (txg != 0) {
+ dmu_prealloc(os, object, offset, size, tx);
+ dmu_tx_commit(tx);
+ txg_wait_synced(dmu_objset_pool(os), txg);
+ } else {
+ (void) dmu_free_long_range(os, object, offset, size);
+ }
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, object);
+}
+
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+ ztest_block_tag_t wbt;
+ dmu_object_info_t doi;
+ enum ztest_io_type io_type;
+ uint64_t blocksize;
+ void *data;
+
+ VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+ blocksize = doi.doi_data_block_size;
+ data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+ /*
+ * Pick an i/o type at random, biased toward writing block tags.
+ */
+ io_type = ztest_random(ZTEST_IO_TYPES);
+ if (ztest_random(2) == 0)
+ io_type = ZTEST_IO_WRITE_TAG;
+
+ switch (io_type) {
+
+ case ZTEST_IO_WRITE_TAG:
+ ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+ (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+ break;
+
+ case ZTEST_IO_WRITE_PATTERN:
+ (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+ if (ztest_random(2) == 0) {
+ /*
+ * Induce fletcher2 collisions to ensure that
+ * zio_ddt_collision() detects and resolves them
+ * when using fletcher2-verify for deduplication.
+ */
+ ((uint64_t *)data)[0] ^= 1ULL << 63;
+ ((uint64_t *)data)[4] ^= 1ULL << 63;
+ }
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_WRITE_ZEROES:
+ bzero(data, blocksize);
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_TRUNCATE:
+ (void) ztest_truncate(zd, object, offset, blocksize);
+ break;
+
+ case ZTEST_IO_SETATTR:
+ (void) ztest_setattr(zd, object);
+ break;
+ }
+
+ umem_free(data, blocksize);
+}
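
The fletcher2-collision trick in ZTEST_IO_WRITE_PATTERN works because fletcher-2 runs two independent lanes of 64-bit words (even and odd indices), each with a simple sum a and a running sum-of-sums b. Flipping bit 63 of words 0 and 4 perturbs only the even lane: the two 2^63 contributions to a sum to 2^64 = 0 (mod 2^64), and their contributions to b total an even multiple of 2^63, i.e. a multiple of 2^64, so they vanish as well. A standalone demonstration, using a simplified fletcher-2 rather than ZFS's implementation:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified fletcher-2: two lanes of (sum, sum-of-sums), mod 2^64. */
static void
fletcher2(const uint64_t *ip, size_t nwords, uint64_t out[4])
{
	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;

	for (size_t i = 0; i + 1 < nwords; i += 2) {
		a0 += ip[i];     b0 += a0;	/* even lane */
		a1 += ip[i + 1]; b1 += a1;	/* odd lane */
	}
	out[0] = a0; out[1] = a1; out[2] = b0; out[3] = b1;
}

int
main(void)
{
	uint64_t x[16], y[16], cx[4], cy[4];

	memset(x, 0xab, sizeof (x));
	memcpy(y, x, sizeof (y));
	y[0] ^= 1ULL << 63;	/* the same two bit flips as above */
	y[4] ^= 1ULL << 63;

	fletcher2(x, 16, cx);
	fletcher2(y, 16, cy);
	(void) printf("collision: %s\n",
	    memcmp(cx, cy, sizeof (cx)) == 0 ? "yes" : "no");
	return (0);
}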
+
+/*
+ * Initialize an object description template.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+ dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+{
+ od->od_dir = ZTEST_DIROBJ;
+ od->od_object = 0;
+
+ od->od_crtype = type;
+ od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+ od->od_crgen = gen;
+
+ od->od_type = DMU_OT_NONE;
+ od->od_blocksize = 0;
+ od->od_gen = 0;
+
+ (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+ tag, (int64_t)id, index);
+}
+
+/*
+ * Lookup or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones. Otherwise,
+ * use the existing objects.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+ int count = size / sizeof (*od);
+ int rv = 0;
+
+ VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+ if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+ (ztest_remove(zd, od, count) != 0 ||
+ ztest_create(zd, od, count) != 0))
+ rv = -1;
+ zd->zd_od = od;
+ VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+
+ return (rv);
+}
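
The usual shape of a test body, then, is: fill in one or more od templates, call ztest_object_init() to look up or (re)create the backing objects, and bail quietly on failure, since ENOSPC is an expected outcome. A hedged sketch of a hypothetical test written against the helpers above:

/* Hypothetical test body; ztest_example is not part of the real suite. */
void
ztest_example(ztest_ds_t *zd, uint64_t id)
{
	ztest_od_t od[1];

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;		/* couldn't create; likely out of space */

	/* Drive random I/O at a block-aligned offset within the object. */
	ztest_io(zd, od[0].od_object,
	    ztest_random(8) * od[0].od_blocksize);
}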
+
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+ zilog_t *zilog = zd->zd_zilog;
+
+ zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
+
+ /*
+ * Remember the committed values in zd, which is in parent/child
+ * shared memory. If we die, the next iteration of ztest_run()
+ * will verify that the log really does contain this record.
+ */
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+ zd->zd_seq = zilog->zl_commit_lr_seq;
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
* Verify that we can't destroy an active pool, create an existing pool,
* or create a pool with a bad vdev spec.
*/
+/* ARGSUSED */
void
-ztest_spa_create_destroy(ztest_args_t *za)
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
- int error;
+ ztest_shared_t *zs = ztest_shared;
spa_t *spa;
nvlist_t *nvroot;
@@ -808,41 +2082,31 @@ ztest_spa_create_destroy(ztest_args_t *za)
* Attempt to create using a bad file.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_file) = %d", error);
/*
* Attempt to create using a bad mirror.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
- error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_mirror) = %d", error);
/*
* Attempt to create an existing pool. It shouldn't matter
* what's in the nvroot; we should fail with EEXIST.
*/
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ (void) rw_rdlock(&zs->zs_name_lock);
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
+ VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != EEXIST)
- fatal(0, "spa_create(whatever) = %d", error);
-
- error = spa_open(za->za_pool, &spa, FTAG);
- if (error)
- fatal(0, "spa_open() = %d", error);
-
- error = spa_destroy(za->za_pool);
- if (error != EBUSY)
- fatal(0, "spa_destroy() = %d", error);
-
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
spa_close(spa, FTAG);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
static vdev_t *
@@ -862,49 +2126,101 @@ vdev_lookup_by_path(vdev_t *vd, const char *path)
}
/*
+ * Find the first available hole which can be used as a top-level.
+ */
+int
+find_vdev_hole(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+
+ if (cvd->vdev_ishole)
+ break;
+ }
+ return (c);
+}
+
+/*
* Verify that vdev_add() works as expected.
*/
+/* ARGSUSED */
void
-ztest_vdev_add_remove(ztest_args_t *za)
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
- uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ uint64_t leaves;
+ uint64_t guid;
nvlist_t *nvroot;
int error;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- ztest_shared->zs_vdev_primaries =
- spa->spa_root_vdev->vdev_children * leaves;
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
/*
- * Make 1/4 of the devices be log devices.
+ * If we have slogs then remove them 1/4 of the time.
*/
- nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
- ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+ if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ /*
+ * Grab the guid from the head of the log class rotor.
+ */
+ guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
- error = spa_vdev_add(spa, nvroot);
- nvlist_free(nvroot);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between removing a slog (dmu_objset_find)
+ * and destroying a dataset. Removing the slog will
+		 * grab a reference on the dataset, which may cause
+		 * dmu_objset_destroy() to fail with EBUSY and thus
+		 * leave the dataset in an inconsistent state.
+ */
+ VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+
+ if (error && error != EEXIST)
+ fatal(0, "spa_vdev_remove() = %d", error);
+ } else {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * Make 1/4 of the devices be log devices.
+ */
+ nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+ ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+ }
- if (error == ENOSPC)
- ztest_record_enospc("spa_vdev_add");
- else if (error != 0)
- fatal(0, "spa_vdev_add() = %d", error);
+ VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
}
/*
* Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
*/
+/* ARGSUSED */
void
-ztest_vdev_aux_add_remove(ztest_args_t *za)
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
vdev_t *rvd = spa->spa_root_vdev;
spa_aux_vdev_t *sav;
char *aux;
@@ -919,7 +2235,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
aux = ZPOOL_CONFIG_L2CACHE;
}
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -932,12 +2248,12 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
/*
* Find an unused device we can add.
*/
- ztest_shared->zs_vdev_aux = 0;
+ zs->zs_vdev_aux = 0;
for (;;) {
char path[MAXPATHLEN];
int c;
(void) sprintf(path, ztest_aux_template, zopt_dir,
- zopt_pool, aux, ztest_shared->zs_vdev_aux);
+ zopt_pool, aux, zs->zs_vdev_aux);
for (c = 0; c < sav->sav_count; c++)
if (strcmp(sav->sav_vdevs[c]->vdev_path,
path) == 0)
@@ -945,7 +2261,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
if (c == sav->sav_count &&
vdev_lookup_by_path(rvd, path) == NULL)
break;
- ztest_shared->zs_vdev_aux++;
+ zs->zs_vdev_aux++;
}
}
@@ -968,28 +2284,126 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
* of devices that have pending state changes.
*/
if (ztest_random(2) == 0)
- (void) vdev_online(spa, guid, B_FALSE, NULL);
+ (void) vdev_online(spa, guid, 0, NULL);
error = spa_vdev_remove(spa, guid, B_FALSE);
if (error != 0 && error != EBUSY)
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * Split a pool if it has mirror top-level vdevs.
+ */
+/* ARGSUSED */
+void
+ztest_split_pool(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ nvlist_t *tree, **child, *config, *split, **schild;
+ uint_t c, children, schildren = 0, lastlogid = 0;
+ int error = 0;
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+
+ /* ensure we have a usable config; mirrors of raidz aren't supported */
+ if (zs->zs_mirrors < 3 || zopt_raidz > 1) {
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ return;
+ }
+
+ /* clean up the old pool, if any */
+ (void) spa_destroy("splitp");
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* generate a config from the existing config */
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
+ &tree) == 0);
+ mutex_exit(&spa->spa_props_lock);
+
+ VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0);
+
+ schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
+ for (c = 0; c < children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ nvlist_t **mchild;
+ uint_t mchildren;
+
+ if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
+ VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
+ 0) == 0);
+ VERIFY(nvlist_add_string(schild[schildren],
+ ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(schild[schildren],
+ ZPOOL_CONFIG_IS_HOLE, 1) == 0);
+ if (lastlogid == 0)
+ lastlogid = schildren;
+ ++schildren;
+ continue;
+ }
+ lastlogid = 0;
+ VERIFY(nvlist_lookup_nvlist_array(child[c],
+ ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+ VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
+ }
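/*
 * Editor's note (not part of the original change): in the loop above,
 * log and hole top-level vdevs are replaced by hole vdevs in the split
 * config, since only the data mirrors can be split off.  lastlogid
 * remembers where a trailing run of logs/holes begins so the child
 * array can be truncated there; encountering a mirror child after such
 * a run resets it to 0.
 */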
+
+ /* OK, create a config that can be used to split */
+ VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
+ lastlogid != 0 ? lastlogid : schildren) == 0);
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
+
+ for (c = 0; c < schildren; c++)
+ nvlist_free(schild[c]);
+ free(schild);
+ nvlist_free(split);
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ (void) rw_wrlock(&zs->zs_name_lock);
+ error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
+ (void) rw_unlock(&zs->zs_name_lock);
+
+ nvlist_free(config);
+
+ if (error == 0) {
+ (void) printf("successful split - results:\n");
+ mutex_enter(&spa_namespace_lock);
+ show_pool_stats(spa);
+ show_pool_stats(spa_lookup("splitp"));
+ mutex_exit(&spa_namespace_lock);
+ ++zs->zs_splits;
+ --zs->zs_mirrors;
+ }
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
}
/*
* Verify that we can attach and detach devices.
*/
+/* ARGSUSED */
void
-ztest_vdev_attach_detach(ztest_args_t *za)
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
spa_aux_vdev_t *sav = &spa->spa_spares;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *pvd;
nvlist_t *root;
- uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ uint64_t leaves;
uint64_t leaf, top;
uint64_t ashift = ztest_get_ashift();
uint64_t oldguid, pguid;
@@ -1001,7 +2415,8 @@ ztest_vdev_attach_detach(ztest_args_t *za)
int oldvd_is_log;
int error, expected_error;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -1013,7 +2428,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
/*
* Pick a random top-level vdev.
*/
- top = ztest_random(rvd->vdev_children);
+ top = ztest_random_vdev_top(spa, B_TRUE);
/*
* Pick a random leaf within it.
@@ -1024,9 +2439,9 @@ ztest_vdev_attach_detach(ztest_args_t *za)
* Locate this vdev.
*/
oldvd = rvd->vdev_child[top];
- if (zopt_mirrors >= 1) {
+ if (zs->zs_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
- ASSERT(oldvd->vdev_children >= zopt_mirrors);
+ ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
oldvd = oldvd->vdev_child[leaf / zopt_raidz];
}
if (zopt_raidz > 1) {
@@ -1046,7 +2461,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
}
oldguid = oldvd->vdev_guid;
- oldsize = vdev_get_rsize(oldvd);
+ oldsize = vdev_get_min_asize(oldvd);
oldvd_is_log = oldvd->vdev_top->vdev_islog;
(void) strcpy(oldpath, oldvd->vdev_path);
pvd = oldvd->vdev_parent;
@@ -1061,7 +2476,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
if (error != 0 && error != ENODEV && error != EBUSY &&
error != ENOTSUP)
fatal(0, "detach (%s) returned %d", oldpath, error);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
return;
}
@@ -1082,7 +2497,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
}
if (newvd) {
- newsize = vdev_get_rsize(newvd);
+ newsize = vdev_get_min_asize(newvd);
} else {
/*
* Make newsize a little bigger or smaller than oldsize.
@@ -1154,169 +2569,373 @@ ztest_vdev_attach_detach(ztest_args_t *za)
(longlong_t)newsize, replacing, error, expected_error);
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
}
/*
- * Verify that dynamic LUN growth works as expected.
+ * Callback function which expands the physical size of the vdev.
*/
-void
-ztest_vdev_LUN_growth(ztest_args_t *za)
+vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
{
- spa_t *spa = za->za_spa;
- char dev_name[MAXPATHLEN];
- uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
- uint64_t vdev;
+ spa_t *spa = vd->vdev_spa;
+ size_t *newsize = arg;
size_t fsize;
int fd;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+ return (vd);
+
+ fsize = lseek(fd, 0, SEEK_END);
+ (void) ftruncate(fd, *newsize);
+
+ if (zopt_verbose >= 6) {
+ (void) printf("%s grew from %lu to %lu bytes\n",
+ vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+ }
+ (void) close(fd);
+ return (NULL);
+}
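/*
 * Editor's note (not part of the original change): the callback
 * contract here is inverted from the usual "non-zero means error"
 * style -- returning the vdev aborts the walk (failure), while
 * returning NULL lets vdev_walk_tree() continue.  grow_vdev() can only
 * resize file-backed vdevs, which is why an open() failure simply
 * hands the vdev back to the caller.
 */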
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
+ */
+/* ARGSUSED */
+vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *tvd = vd->vdev_top;
+ uint64_t guid = vd->vdev_guid;
+ uint64_t generation = spa->spa_config_generation + 1;
+ vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /* Calling vdev_online will initialize the new metaslabs */
+ spa_config_exit(spa, SCL_STATE, spa);
+ error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
/*
- * Pick a random leaf vdev.
+ * If vdev_online returned an error or the underlying vdev_open
+ * failed, then we abort the expand. The only way to know whether
+ * vdev_open failed is to check the returned newstate.
*/
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ if (error || newstate != VDEV_STATE_HEALTHY) {
+ if (zopt_verbose >= 5) {
+ (void) printf("Unable to expand vdev, state %llu, "
+ "error %d\n", (u_longlong_t)newstate, error);
+ }
+ return (vd);
+ }
+ ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+ /*
+ * Since we dropped the lock we need to ensure that we're
+ * still talking to the original vdev. It's possible this
+ * vdev may have been detached/replaced while we were
+ * trying to online it.
+ */
+ if (generation != spa->spa_config_generation) {
+ if (zopt_verbose >= 5) {
+ (void) printf("vdev configuration has changed, "
+ "guid %llu, state %llu, expected gen %llu, "
+ "got gen %llu\n",
+ (u_longlong_t)guid,
+ (u_longlong_t)tvd->vdev_state,
+ (u_longlong_t)generation,
+ (u_longlong_t)spa->spa_config_generation);
+ }
+ return (vd);
+ }
+ return (NULL);
+}
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+/*
+ * Traverse the vdev tree calling the supplied function.
+ * We continue to walk the tree until we either have walked all
+ * children or we receive a non-NULL return from the callback.
+ * If a NULL callback is passed, then we just return the first
+ * leaf vdev we encounter.
+ */
+vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+ if (vd->vdev_ops->vdev_op_leaf) {
+ if (func == NULL)
+ return (vd);
+ else
+ return (func(vd, arg));
+ }
- if ((fd = open(dev_name, O_RDWR)) != -1) {
- /*
- * Determine the size.
- */
- fsize = lseek(fd, 0, SEEK_END);
+ for (uint_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
+ return (cvd);
+ }
+ return (NULL);
+}
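/*
 * Illustrative sketch (editor's addition, not part of the original
 * change): a hypothetical callback demonstrating the vdev_walk_tree()
 * contract -- returning non-NULL stops the walk at that leaf.
 */
static vdev_t *
find_vdev_by_path(vdev_t *vd, void *arg)
{
	/* Stop the walk at the first leaf whose path matches. */
	return (strcmp(vd->vdev_path, (char *)arg) == 0 ? vd : NULL);
}
/* usage: vd = vdev_walk_tree(spa->spa_root_vdev, find_vdev_by_path, path); */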
- /*
- * If it's less than 2x the original size, grow by around 3%.
- */
- if (fsize < 2 * zopt_vdev_size) {
- size_t newsize = fsize + ztest_random(fsize / 32);
- (void) ftruncate(fd, newsize);
- if (zopt_verbose >= 6) {
- (void) printf("%s grew from %lu to %lu bytes\n",
- dev_name, (ulong_t)fsize, (ulong_t)newsize);
- }
+/*
+ * Verify that dynamic LUN growth works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ vdev_t *vd, *tvd;
+ metaslab_class_t *mc;
+ metaslab_group_t *mg;
+ size_t psize, newsize;
+ uint64_t top;
+ uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ top = ztest_random_vdev_top(spa, B_TRUE);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ mg = tvd->vdev_mg;
+ mc = mg->mg_class;
+ old_ms_count = tvd->vdev_ms_count;
+ old_class_space = metaslab_class_get_space(mc);
+
+ /*
+ * Determine the size of the first leaf vdev associated with
+ * our top-level device.
+ */
+ vd = vdev_walk_tree(tvd, NULL, NULL);
+ ASSERT3P(vd, !=, NULL);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ psize = vd->vdev_psize;
+
+ /*
+ * We only try to expand the vdev if it's healthy, less than 4x its
+ * original size, and it has a valid psize.
+ */
+ if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+ psize == 0 || psize >= 4 * zopt_vdev_size) {
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ return;
+ }
+ ASSERT(psize > 0);
+ newsize = psize + psize / 8;
+ ASSERT3U(newsize, >, psize);
+
+ if (zopt_verbose >= 6) {
+ (void) printf("Expanding LUN %s from %lu to %lu\n",
+ vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+ }
+
+ /*
+ * Growing the vdev is a two step process:
+ * 1). expand the physical size (i.e. relabel)
+ * 2). online the vdev to create the new metaslabs
+ */
+ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+ vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+ tvd->vdev_state != VDEV_STATE_HEALTHY) {
+ if (zopt_verbose >= 5) {
+ (void) printf("Could not expand LUN because "
+ "the vdev configuration changed.\n");
}
- (void) close(fd);
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ return;
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ spa_config_exit(spa, SCL_STATE, spa);
+
+ /*
+ * Expanding the LUN will update the config asynchronously,
+ * thus we must wait for the async thread to complete any
+ * pending tasks before proceeding.
+ */
+ for (;;) {
+ boolean_t done;
+ mutex_enter(&spa->spa_async_lock);
+ done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+ mutex_exit(&spa->spa_async_lock);
+ if (done)
+ break;
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100);
+ }
+
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ new_ms_count = tvd->vdev_ms_count;
+ new_class_space = metaslab_class_get_space(mc);
+
+ if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+ if (zopt_verbose >= 5) {
+ (void) printf("Could not verify LUN expansion due to "
+ "intervening vdev offline or remove.\n");
+ }
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ return;
+ }
+
+ /*
+ * Make sure we were able to grow the vdev.
+ */
+ if (new_ms_count <= old_ms_count)
+ fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+ old_ms_count, new_ms_count);
+
+ /*
+ * Make sure we were able to grow the pool.
+ */
+ if (new_class_space <= old_class_space)
+ fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+ old_class_space, new_class_space);
+
+ if (zopt_verbose >= 5) {
+ char oldnumbuf[6], newnumbuf[6];
+
+ nicenum(old_class_space, oldnumbuf);
+ nicenum(new_class_space, newnumbuf);
+ (void) printf("%s grew from %s to %s\n",
+ spa->spa_name, oldnumbuf, newnumbuf);
+ }
+
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
}
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
/* ARGSUSED */
static void
-ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
/*
- * Create the directory object.
+ * Create the objects common to all ztest datasets.
*/
- VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
- DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
- DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
-
- VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
+ VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
- VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
- DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+static int
+ztest_dataset_create(char *dsname)
+{
+ uint64_t zilset = ztest_random(100);
+ int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
+ ztest_objset_create_cb, NULL);
+
+ if (err || zilset < 80)
+ return (err);
+
+ (void) printf("Setting dataset %s to sync always\n", dsname);
+ return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+ ZFS_SYNC_ALWAYS, B_FALSE));
}
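/*
 * Editor's note (not part of the original change): ztest_random(100)
 * returns a value in [0, 100), and the early return above fires for
 * zilset < 80, so roughly one dataset in five is created with
 * sync=always to keep the ZIL commit path under constant test.
 */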
+/* ARGSUSED */
static int
-ztest_destroy_cb(char *name, void *arg)
+ztest_objset_destroy_cb(const char *name, void *arg)
{
- ztest_args_t *za = arg;
objset_t *os;
- dmu_object_info_t *doi = &za->za_doi;
+ dmu_object_info_t doi;
int error;
/*
* Verify that the dataset contains a directory object.
*/
- error = dmu_objset_open(name, DMU_OST_OTHER,
- DS_MODE_USER | DS_MODE_READONLY, &os);
- ASSERT3U(error, ==, 0);
- error = dmu_object_info(os, ZTEST_DIROBJ, doi);
+ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+ error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
if (error != ENOENT) {
/* We could have crashed in the middle of destroying it */
ASSERT3U(error, ==, 0);
- ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
- ASSERT3S(doi->doi_physical_blks, >=, 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+ ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
}
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
/*
* Destroy the dataset.
*/
- error = dmu_objset_destroy(name);
- if (error) {
- (void) dmu_objset_open(name, DMU_OST_OTHER,
- DS_MODE_USER | DS_MODE_READONLY, &os);
- fatal(0, "dmu_objset_destroy(os=%p) = %d\n", &os, error);
- }
+ VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
return (0);
}
-/*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
-static uint64_t
-ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
{
- itx_t *itx;
- lr_create_t *lr;
- size_t namesize;
- char name[24];
-
- (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
- namesize = strlen(name) + 1;
-
- itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
- ztest_random(ZIL_MAX_BLKSZ));
- lr = (lr_create_t *)&itx->itx_lr;
- bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
- lr->lr_doid = object;
- lr->lr_foid = 0;
- lr->lr_mode = mode;
- lr->lr_uid = 0;
- lr->lr_gid = 0;
- lr->lr_gen = dmu_tx_get_txg(tx);
- lr->lr_crtime[0] = time(NULL);
- lr->lr_crtime[1] = 0;
- lr->lr_rdev = 0;
- bcopy(name, (char *)(lr + 1), namesize);
-
- return (zil_itx_assign(zilog, itx, tx));
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+ NULL, NULL, B_FALSE, B_FALSE, -1);
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (B_FALSE);
+ }
+ if (error != 0 && error != EEXIST)
+ fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+ return (B_TRUE);
}
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_destroy(snapname, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+ return (B_TRUE);
+}
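/*
 * Editor's note (not part of the original change): both helpers derive
 * the snapshot name deterministically as <osname>@<id>, so a given
 * test iteration always destroys exactly the snapshot it created, and
 * EEXIST/ENOENT left over from a crashed previous run are tolerated.
 */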
+
+/* ARGSUSED */
void
-ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t zdtmp;
+ int iters;
int error;
objset_t *os, *os2;
- char name[100];
- int basemode, expected_error;
+ char name[MAXNAMELEN];
zilog_t *zilog;
- uint64_t seq;
- uint64_t objects;
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
- (u_longlong_t)za->za_instance);
+ (void) rw_rdlock(&zs->zs_name_lock);
- basemode = DS_MODE_TYPE(za->za_instance);
- if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
- basemode = DS_MODE_USER;
+ (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+ zs->zs_pool, (u_longlong_t)id);
/*
* If this dataset exists from a previous run, process its replay log
* half of the time. If we don't replay it, then dmu_objset_destroy()
- * (invoked from ztest_destroy_cb() below) should just throw it away.
+ * (invoked from ztest_objset_destroy_cb()) should just throw it away.
*/
if (ztest_random(2) == 0 &&
- dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
- zil_replay(os, os, ztest_replay_vector);
- dmu_objset_close(os);
+ dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
+ ztest_zd_init(&zdtmp, os);
+ zil_replay(os, &zdtmp, ztest_replay_vector);
+ ztest_zd_fini(&zdtmp);
+ dmu_objset_disown(os, FTAG);
}
/*
@@ -1324,170 +2943,152 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
* create lying around from a previous run. If so, destroy it
* and all of its snapshots.
*/
- (void) dmu_objset_find(name, ztest_destroy_cb, za,
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
/*
* Verify that the destroyed dataset is no longer in the namespace.
*/
- error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
- if (error != ENOENT)
- fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
- name, os);
+ VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
/*
* Verify that we can create a new dataset.
*/
- error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
- ztest_create_cb, NULL);
+ error = ztest_dataset_create(name);
if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_create");
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ ztest_record_enospc(FTAG);
+ (void) rw_unlock(&zs->zs_name_lock);
return;
}
fatal(0, "dmu_objset_create(%s) = %d", name, error);
}
- error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
- if (error) {
- fatal(0, "dmu_objset_open(%s) = %d", name, error);
- }
+ VERIFY3U(0, ==,
+ dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+ ztest_zd_init(&zdtmp, os);
/*
* Open the intent log for it.
*/
- zilog = zil_open(os, NULL);
+ zilog = zil_open(os, ztest_get_data);
/*
- * Put a random number of objects in there.
+ * Put some objects in there, do a little I/O to them,
+ * and randomly take a couple of snapshots along the way.
*/
- objects = ztest_random(20);
- seq = 0;
- while (objects-- != 0) {
- uint64_t object;
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- seq = ztest_log_create(zilog, tx, object,
- DMU_OT_UINT64_OTHER);
- dmu_write(os, object, 0, sizeof (name), name, tx);
- dmu_tx_commit(tx);
- }
- if (ztest_random(5) == 0) {
- zil_commit(zilog, seq, object);
- }
- if (ztest_random(100) == 0) {
- error = zil_suspend(zilog);
- if (error == 0) {
- zil_resume(zilog);
- }
- }
+ iters = ztest_random(5);
+ for (int i = 0; i < iters; i++) {
+ ztest_dmu_object_alloc_free(&zdtmp, id);
+ if (ztest_random(iters) == 0)
+ (void) ztest_snapshot_create(name, i);
}
/*
* Verify that we cannot create an existing dataset.
*/
- error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL);
- if (error != EEXIST)
- fatal(0, "created existing dataset, error = %d", error);
+ VERIFY3U(EEXIST, ==,
+ dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
/*
- * Verify that multiple dataset holds are allowed, but only when
- * the new access mode is compatible with the base mode.
+ * Verify that we can hold an objset that is also owned.
*/
- if (basemode == DS_MODE_OWNER) {
- error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER,
- &os2);
- if (error)
- fatal(0, "dmu_objset_open('%s') = %d", name, error);
- else
- dmu_objset_close(os2);
- }
- error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os2);
- expected_error = (basemode == DS_MODE_OWNER) ? EBUSY : 0;
- if (error != expected_error)
- fatal(0, "dmu_objset_open('%s') = %d, expected %d",
- name, error, expected_error);
- if (error == 0)
- dmu_objset_close(os2);
+ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+ dmu_objset_rele(os2, FTAG);
- zil_close(zilog);
- dmu_objset_close(os);
+ /*
+ * Verify that we cannot own an objset that is already owned.
+ */
+ VERIFY3U(EBUSY, ==,
+ dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
- error = dmu_objset_destroy(name);
- if (error)
- fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
+ zil_close(zilog);
+ dmu_objset_disown(os, FTAG);
+ ztest_zd_fini(&zdtmp);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
*/
void
-ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+ (void) ztest_snapshot_destroy(zd->zd_name, id);
+ (void) ztest_snapshot_create(zd->zd_name, id);
+ (void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Clean up non-standard snapshots and clones.
+ */
+void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
{
+ char snap1name[MAXNAMELEN];
+ char clone1name[MAXNAMELEN];
+ char snap2name[MAXNAMELEN];
+ char clone2name[MAXNAMELEN];
+ char snap3name[MAXNAMELEN];
int error;
- objset_t *os = za->za_os;
- char snapname[100];
- char osname[MAXNAMELEN];
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- dmu_objset_name(os, osname);
- (void) snprintf(snapname, 100, "%s@%llu", osname,
- (u_longlong_t)za->za_instance);
+ (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+ (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+ (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+ (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+ (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
- error = dmu_objset_destroy(snapname);
- if (error != 0 && error != ENOENT)
- fatal(0, "dmu_objset_destroy() = %d", error);
- error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
- NULL, FALSE);
- if (error == ENOSPC)
- ztest_record_enospc("dmu_take_snapshot");
- else if (error != 0 && error != EEXIST)
- fatal(0, "dmu_take_snapshot() = %d", error);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ error = dmu_objset_destroy(clone2name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
+ error = dmu_objset_destroy(snap3name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
+ error = dmu_objset_destroy(snap2name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
+ error = dmu_objset_destroy(clone1name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
+ error = dmu_objset_destroy(snap1name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
}
/*
* Verify dsl_dataset_promote handles EBUSY
*/
void
-ztest_dsl_dataset_promote_busy(ztest_args_t *za)
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
{
- int error;
- objset_t *os = za->za_os;
+ ztest_shared_t *zs = ztest_shared;
objset_t *clone;
dsl_dataset_t *ds;
- char snap1name[100];
- char clone1name[100];
- char snap2name[100];
- char clone2name[100];
- char snap3name[100];
- char osname[MAXNAMELEN];
- static uint64_t uniq = 0;
- uint64_t curval;
+ char snap1name[MAXNAMELEN];
+ char clone1name[MAXNAMELEN];
+ char snap2name[MAXNAMELEN];
+ char clone2name[MAXNAMELEN];
+ char snap3name[MAXNAMELEN];
+ char *osname = zd->zd_name;
+ int error;
- curval = atomic_add_64_nv(&uniq, 5) - 5;
+ (void) rw_rdlock(&zs->zs_name_lock);
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ ztest_dsl_dataset_cleanup(osname, id);
- dmu_objset_name(os, osname);
- (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval++);
- (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval++);
- (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval++);
- (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval++);
- (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval++);
+ (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+ (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+ (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+ (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+ (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
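/*
 * Editor's sketch (not part of the original change) of the dataset
 * hierarchy built below:
 *
 *	osname ---- snap1 (s1)
 *	              \
 *	               clone1 (c1) ---- snap2 (s2), snap3 (s3)
 *	                                               \
 *	                                                clone2 (c2)
 *
 * Promoting clone2 must migrate clone1's snapshots up to snap3 --
 * including snap2, which is held via dsl_dataset_own() below -- so
 * dsl_dataset_promote() is expected to fail with EBUSY.
 */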
error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
- NULL, FALSE);
+ NULL, NULL, B_FALSE, B_FALSE, -1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
@@ -1496,14 +3097,12 @@ ztest_dsl_dataset_promote_busy(ztest_args_t *za)
fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
}
- error = dmu_objset_open(snap1name, DMU_OST_OTHER,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(snap1name, FTAG, &clone);
if (error)
fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
- error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
- NULL, NULL);
- dmu_objset_close(clone);
+ error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
+ dmu_objset_rele(clone, FTAG);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
@@ -1513,7 +3112,7 @@ ztest_dsl_dataset_promote_busy(ztest_args_t *za)
}
error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
- NULL, FALSE);
+ NULL, NULL, B_FALSE, B_FALSE, -1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
@@ -1523,7 +3122,7 @@ ztest_dsl_dataset_promote_busy(ztest_args_t *za)
}
error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
- NULL, FALSE);
+ NULL, NULL, B_FALSE, B_FALSE, -1);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc(FTAG);
@@ -1532,289 +3131,73 @@ ztest_dsl_dataset_promote_busy(ztest_args_t *za)
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
}
- error = dmu_objset_open(snap3name, DMU_OST_OTHER,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(snap3name, FTAG, &clone);
if (error)
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
- error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
- NULL, NULL);
- dmu_objset_close(clone);
+ error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
+ dmu_objset_rele(clone, FTAG);
if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_create");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
}
- error = dsl_dataset_own(snap1name, 0, FTAG, &ds);
+ error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
if (error)
- fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
- error = dsl_dataset_promote(clone2name);
+ fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
+ error = dsl_dataset_promote(clone2name, NULL);
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error);
dsl_dataset_disown(ds, FTAG);
out:
- error = dmu_objset_destroy(clone2name);
- if (error && error != ENOENT)
- fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
-
- error = dmu_objset_destroy(snap3name);
- if (error && error != ENOENT)
- fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
+ ztest_dsl_dataset_cleanup(osname, id);
- error = dmu_objset_destroy(snap2name);
- if (error && error != ENOENT)
- fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
-
- error = dmu_objset_destroy(clone1name);
- if (error && error != ENOENT)
- fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
- error = dmu_objset_destroy(snap1name);
- if (error && error != ENOENT)
- fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
-
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Verify that dmu_object_{alloc,free} work as expected.
*/
void
-ztest_dmu_object_alloc_free(ztest_args_t *za)
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- uint64_t batchobj, object, batchsize, endoff, temp;
- int b, c, error, bonuslen;
- dmu_object_info_t *doi = &za->za_doi;
- char osname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
+ ztest_od_t od[4];
+ int batchsize = sizeof (od) / sizeof (od[0]);
- endoff = -8ULL;
- batchsize = 2;
-
- /*
- * Create a batch object if necessary, and record it in the directory.
- */
- VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
- if (batchobj == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create a batch object");
- dmu_tx_abort(tx);
- return;
- }
- batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, batchobj, tx);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, tx);
- dmu_tx_commit(tx);
- }
-
- /*
- * Destroy the previous batch of objects.
- */
- for (b = 0; b < batchsize; b++) {
- VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
- if (object == 0)
- continue;
- /*
- * Read and validate contents.
- * We expect the nth byte of the bonus buffer to be n.
- */
- VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
- za->za_dbuf = db;
-
- dmu_object_info_from_db(db, doi);
- ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
- ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
- ASSERT3S(doi->doi_physical_blks, >=, 0);
-
- bonuslen = doi->doi_bonus_size;
-
- for (c = 0; c < bonuslen; c++) {
- if (((uint8_t *)db->db_data)[c] !=
- (uint8_t)(c + bonuslen)) {
- fatal(0,
- "bad bonus: %s, obj %llu, off %d: %u != %u",
- osname, object, c,
- ((uint8_t *)db->db_data)[c],
- (uint8_t)(c + bonuslen));
- }
- }
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * We expect the word at endoff to be our object number.
- */
- VERIFY(0 == dmu_read(os, object, endoff,
- sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-
- if (temp != object) {
- fatal(0, "bad data in %s, got %llu, expected %llu",
- osname, temp, object);
- }
-
- /*
- * Destroy old object and clear batch entry.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj,
- b * sizeof (uint64_t), sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("free object");
- dmu_tx_abort(tx);
- return;
- }
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- object = 0;
-
- dmu_object_set_checksum(os, batchobj,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, batchobj,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- dmu_tx_commit(tx);
- }
-
- /*
- * Before creating the new batch of objects, generate a bunch of churn.
- */
- for (b = ztest_random(100); b > 0; b--) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("churn objects");
- dmu_tx_abort(tx);
- return;
- }
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- dmu_tx_commit(tx);
- }
+ for (int b = 0; b < batchsize; b++)
+ ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
/*
- * Create a new batch of objects with randomly chosen
- * blocksizes and record them in the batch directory.
+ * Destroy the previous batch of objects, create a new batch,
+ * and do some I/O on the new objects.
*/
- for (b = 0; b < batchsize; b++) {
- uint32_t va_blksize;
- u_longlong_t va_nblocks;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
- sizeof (uint64_t));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create batchobj");
- dmu_tx_abort(tx);
- return;
- }
- bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_PLAIN_OTHER, bonuslen, tx);
-
- ztest_set_random_blocksize(os, object, tx);
-
- dmu_object_set_checksum(os, object,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, object,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- /*
- * Write to both the bonus buffer and the regular data.
- */
- VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
- za->za_dbuf = db;
- ASSERT3U(bonuslen, <=, db->db_size);
-
- dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
- ASSERT3S(va_nblocks, >=, 0);
-
- dmu_buf_will_dirty(db, tx);
-
- /*
- * See comments above regarding the contents of
- * the bonus buffer and the word at endoff.
- */
- for (c = 0; c < bonuslen; c++)
- ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * Write to a large offset to increase indirection.
- */
- dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+ if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+ return;
- dmu_tx_commit(tx);
- }
+ while (ztest_random(4 * batchsize) != 0)
+ ztest_io(zd, od[ztest_random(batchsize)].od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
}
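/*
 * Editor's note (not part of the original change): this is the object
 * idiom used throughout the rewritten tests -- fill a ztest_od_t array
 * with ztest_od_init(), let ztest_object_init() look up or (re)create
 * the objects, then drive random I/O through ztest_io().  Both helpers
 * are introduced elsewhere in this change.
 */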
/*
* Verify that dmu_{read,write} work as expected.
*/
-typedef struct bufwad {
- uint64_t bw_index;
- uint64_t bw_txg;
- uint64_t bw_data;
-} bufwad_t;
-
-typedef struct dmu_read_write_dir {
- uint64_t dd_packobj;
- uint64_t dd_bigobj;
- uint64_t dd_chunk;
-} dmu_read_write_dir_t;
-
void
-ztest_dmu_read_write(ztest_args_t *za)
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_read_write_dir_t dd;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[2];
dmu_tx_t *tx;
int i, freeit, error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
- uint64_t packoff, packsize, bigoff, bigsize;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 40;
@@ -1847,34 +3230,16 @@ ztest_dmu_read_write(ztest_args_t *za)
/*
* Read the directory info. If it's the first time, set things up.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (dd), &dd, DMU_READ_PREFETCH));
- if (dd.dd_chunk == 0) {
- ASSERT(dd.dd_packobj == 0);
- ASSERT(dd.dd_bigobj == 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create r/w directory");
- dmu_tx_abort(tx);
- return;
- }
-
- dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
- ztest_set_random_blocksize(os, dd.dd_packobj, tx);
- ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
- tx);
- dmu_tx_commit(tx);
- }
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ chunksize = od[0].od_gen;
+ ASSERT(chunksize == od[1].od_gen);
/*
* Prefetch a random chunk of the big object.
@@ -1884,7 +3249,7 @@ ztest_dmu_read_write(ztest_args_t *za)
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(2 * width - 1);
- dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+ dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
/*
* Pick a random index and compute the offsets into packobj and bigobj.
@@ -1895,8 +3260,8 @@ ztest_dmu_read_write(ztest_args_t *za)
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
- bigoff = n * dd.dd_chunk;
- bigsize = s * dd.dd_chunk;
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
packbuf = umem_alloc(packsize, UMEM_NOFAIL);
bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
@@ -1910,10 +3275,10 @@ ztest_dmu_read_write(ztest_args_t *za)
/*
* Read the current contents of our objects.
*/
- error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+ error = dmu_read(os, packobj, packoff, packsize, packbuf,
DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
- error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+ error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
@@ -1922,24 +3287,25 @@ ztest_dmu_read_write(ztest_args_t *za)
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
if (freeit)
- dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+ dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
else
- dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
- error = dmu_tx_assign(tx, TXG_WAIT);
-
- if (error) {
- ztest_record_enospc("dmu r/w range");
- dmu_tx_abort(tx);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
return;
}
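/*
 * Editor's note (not part of the original change): ztest_tx_assign(),
 * introduced elsewhere in this change, wraps dmu_tx_assign(); on
 * failure it records ENOSPC (or waits out ERESTART), aborts the tx,
 * and returns txg 0, which is why callers only need to free their
 * buffers and bail.
 */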
- txg = dmu_tx_get_txg(tx);
+ dmu_object_set_checksum(os, bigobj,
+ (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+
+ dmu_object_set_compress(os, bigobj,
+ (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
/*
* For each index from n to n + s, verify that the existing bufwad
@@ -1951,9 +3317,9 @@ ztest_dmu_read_write(ztest_args_t *za)
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
- bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -1987,27 +3353,26 @@ ztest_dmu_read_write(ztest_args_t *za)
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
- dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
if (freeit) {
- if (zopt_verbose >= 6) {
+ if (zopt_verbose >= 7) {
(void) printf("freeing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
- bigsize, tx));
+ VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
} else {
- if (zopt_verbose >= 6) {
+ if (zopt_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+ dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
}
dmu_tx_commit(tx);
@@ -2019,9 +3384,9 @@ ztest_dmu_read_write(ztest_args_t *za)
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+ VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2037,7 +3402,7 @@ ztest_dmu_read_write(ztest_args_t *za)
void
compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
- uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
+ uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
{
uint64_t i;
bufwad_t *pack;
@@ -2054,9 +3419,9 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
- bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2085,22 +3450,24 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
}
void
-ztest_dmu_read_write_zcopy(ztest_args_t *za)
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_read_write_dir_t dd;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[2];
dmu_tx_t *tx;
uint64_t i;
int error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf;
- uint64_t packoff, packsize, bigoff, bigsize;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t blocksize = ztest_random_blocksize();
+ uint64_t chunksize = blocksize;
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 9;
dmu_buf_t *bonus_db;
arc_buf_t **bigbuf_arcbufs;
- dmu_object_info_t *doi = &za->za_doi;
+ dmu_object_info_t doi;
/*
* This test uses two objects, packobj and bigobj, that are always
@@ -2121,42 +3488,22 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
/*
* Read the directory info. If it's the first time, set things up.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (dd), &dd, DMU_READ_PREFETCH));
- if (dd.dd_chunk == 0) {
- ASSERT(dd.dd_packobj == 0);
- ASSERT(dd.dd_bigobj == 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create r/w directory");
- dmu_tx_abort(tx);
- return;
- }
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
- dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, dd.dd_packobj, tx);
- ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
- VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
- ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
- ASSERT(ISP2(doi->doi_data_block_size));
- dd.dd_chunk = doi->doi_data_block_size;
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ blocksize = od[0].od_blocksize;
+ chunksize = blocksize;
+ ASSERT(chunksize == od[1].od_gen);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
- tx);
- dmu_tx_commit(tx);
- } else {
- VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
- VERIFY(ISP2(doi->doi_data_block_size));
- VERIFY(dd.dd_chunk == doi->doi_data_block_size);
- VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
- }
+ VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+ VERIFY(ISP2(doi.doi_data_block_size));
+ VERIFY(chunksize == doi.doi_data_block_size);
+ VERIFY(chunksize >= 2 * sizeof (bufwad_t));
/*
* Pick a random index and compute the offsets into packobj and bigobj.
@@ -2167,13 +3514,13 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
- bigoff = n * dd.dd_chunk;
- bigsize = s * dd.dd_chunk;
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
- VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
+ VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
@@ -2199,15 +3546,12 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
for (j = 0; j < s; j++) {
if (i != 5) {
bigbuf_arcbufs[j] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk);
+ dmu_request_arcbuf(bonus_db, chunksize);
} else {
bigbuf_arcbufs[2 * j] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk / 2);
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
bigbuf_arcbufs[2 * j + 1] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk / 2);
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
}
}
@@ -2216,20 +3560,11 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
- dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
- if (ztest_random(100) == 0) {
- error = -1;
- } else {
- error = dmu_tx_assign(tx, TXG_WAIT);
- }
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
- if (error) {
- if (error != -1) {
- ztest_record_enospc("dmu r/w range");
- }
- dmu_tx_abort(tx);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
@@ -2247,54 +3582,52 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
return;
}
- txg = dmu_tx_get_txg(tx);
-
/*
* 50% of the time don't read objects in the 1st iteration to
* test dmu_assign_arcbuf() for the case when there are no
* existing dbufs for the specified offsets.
*/
if (i != 0 || ztest_random(2) != 0) {
- error = dmu_read(os, dd.dd_packobj, packoff,
+ error = dmu_read(os, packobj, packoff,
packsize, packbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
- error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
+ error = dmu_read(os, bigobj, bigoff, bigsize,
bigbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
}
compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
- n, dd, txg);
+ n, chunksize, txg);
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
- dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
- if (zopt_verbose >= 6) {
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+ if (zopt_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
+ for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
bcopy((caddr_t)bigbuf + (off - bigoff),
- bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
+ bigbuf_arcbufs[j]->b_data, chunksize);
} else {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[2 * j]->b_data,
- dd.dd_chunk / 2);
+ chunksize / 2);
bcopy((caddr_t)bigbuf + (off - bigoff) +
- dd.dd_chunk / 2,
+ chunksize / 2,
bigbuf_arcbufs[2 * j + 1]->b_data,
- dd.dd_chunk / 2);
+ chunksize / 2);
}
if (i == 1) {
- VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
- FTAG, &dbt) == 0);
+ VERIFY(dmu_buf_hold(os, bigobj, off,
+ FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
dmu_assign_arcbuf(bonus_db, off,
@@ -2303,7 +3636,7 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[2 * j], tx);
dmu_assign_arcbuf(bonus_db,
- off + dd.dd_chunk / 2,
+ off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx);
}
if (i == 1) {
@@ -2319,9 +3652,9 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+ VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2343,256 +3676,60 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
}
+/* ARGSUSED */
void
-ztest_dmu_check_future_leak(ztest_args_t *za)
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_buf_t *db;
- ztest_block_tag_t *bt;
- dmu_object_info_t *doi = &za->za_doi;
-
- /*
- * Make sure that, if there is a write record in the bonus buffer
- * of the ZTEST_DIROBJ, that the txg for this record is <= the
- * last synced txg of the pool.
- */
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
- za->za_dbuf = db;
- VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
- ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
- ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
- ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
- bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
- if (bt->bt_objset != 0) {
- ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
- ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
- ASSERT3U(bt->bt_offset, ==, -1ULL);
- ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
- }
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-}
-
-void
-ztest_dmu_write_parallel(ztest_args_t *za)
-{
- objset_t *os = za->za_os;
- ztest_block_tag_t *rbt = &za->za_rbt;
- ztest_block_tag_t *wbt = &za->za_wbt;
- const size_t btsize = sizeof (ztest_block_tag_t);
- dmu_buf_t *db;
- int b, error;
- int bs = ZTEST_DIROBJ_BLOCKSIZE;
- int do_free = 0;
- uint64_t off, txg, txg_how;
- mutex_t *lp;
- char osname[MAXNAMELEN];
- char iobuf[SPA_MAXBLOCKSIZE];
- blkptr_t blk = { 0 };
- uint64_t blkoff;
- zbookmark_t zb;
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_buf_t *bonus_db;
- arc_buf_t *abuf = NULL;
-
- dmu_objset_name(os, osname);
+ ztest_od_t od[1];
+ uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
/*
- * Have multiple threads write to large offsets in ZTEST_DIROBJ
- * to verify that having multiple threads writing to the same object
- * in parallel doesn't cause any trouble.
+ * Have multiple threads write to large offsets in an object
+ * to verify that parallel writes to an object -- even to the
+ * same blocks within the object -- don't cause any trouble.
*/
- if (ztest_random(4) == 0) {
- /*
- * Do the bonus buffer instead of a regular block.
- * We need a lock to serialize resize vs. others,
- * so we hash on the objset ID.
- */
- b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
- off = -1ULL;
- dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
- } else {
- b = ztest_random(ZTEST_SYNC_LOCKS);
- off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
- if (ztest_random(4) == 0) {
- do_free = 1;
- dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
- } else {
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
- }
- }
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
- if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
- ztest_random(8) == 0) {
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
- abuf = dmu_request_arcbuf(bonus_db, bs);
- }
-
- txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
- error = dmu_tx_assign(tx, txg_how);
- if (error) {
- if (error == ERESTART) {
- ASSERT(txg_how == TXG_NOWAIT);
- dmu_tx_wait(tx);
- } else {
- ztest_record_enospc("dmu write parallel");
- }
- dmu_tx_abort(tx);
- if (abuf != NULL) {
- dmu_return_arcbuf(abuf);
- dmu_buf_rele(bonus_db, FTAG);
- }
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
- }
- txg = dmu_tx_get_txg(tx);
-
- lp = &ztest_shared->zs_sync_lock[b];
- (void) mutex_lock(lp);
-
- wbt->bt_objset = dmu_objset_id(os);
- wbt->bt_object = ZTEST_DIROBJ;
- wbt->bt_offset = off;
- wbt->bt_txg = txg;
- wbt->bt_thread = za->za_instance;
- wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */
-
- /*
- * Occasionally, write an all-zero block to test the behavior
- * of blocks that compress into holes.
- */
- if (off != -1ULL && ztest_random(8) == 0)
- bzero(wbt, btsize);
-
- if (off == -1ULL) {
- dmu_object_info_t *doi = &za->za_doi;
- char *dboff;
-
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
- za->za_dbuf = db;
- dmu_object_info_from_db(db, doi);
- ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
- ASSERT3U(doi->doi_bonus_size, >=, btsize);
- ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
- dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
- bcopy(dboff, rbt, btsize);
- if (rbt->bt_objset != 0) {
- ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
- ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
- ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
- ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
- }
- if (ztest_random(10) == 0) {
- int newsize = (ztest_random(db->db_size /
- btsize) + 1) * btsize;
-
- ASSERT3U(newsize, >=, btsize);
- ASSERT3U(newsize, <=, db->db_size);
- VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
- dboff = (char *)db->db_data + newsize - btsize;
- }
- dmu_buf_will_dirty(db, tx);
- bcopy(wbt, dboff, btsize);
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
- } else if (do_free) {
- VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
- } else if (abuf == NULL) {
- dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
- } else {
- bcopy(wbt, abuf->b_data, btsize);
- dmu_assign_arcbuf(bonus_db, off, abuf, tx);
- dmu_buf_rele(bonus_db, FTAG);
- }
- (void) mutex_unlock(lp);
-
- if (ztest_random(1000) == 0)
- (void) poll(NULL, 0, 1); /* open dn_notxholds window */
+ while (ztest_random(10) != 0)
+ ztest_io(zd, od[0].od_object, offset);
+}
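/*
 * Editor's note (not part of the original change): the offset above is
 * 2^(43..62) plus a random range-lock-aligned slice, i.e. at least
 * 8 TiB into the object, which forces several levels of indirect
 * blocks while the range locks keep the concurrent writers coherent.
 */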
- dmu_tx_commit(tx);
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_od_t od[1];
+ uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+ uint64_t count = ztest_random(20) + 1;
+ uint64_t blocksize = ztest_random_blocksize();
+ void *data;
- if (ztest_random(10000) == 0)
- txg_wait_synced(dmu_objset_pool(os), txg);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
- if (off == -1ULL || do_free)
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
- if (ztest_random(2) != 0)
+ if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
return;
- /*
- * dmu_sync() the block we just wrote.
- */
- (void) mutex_lock(lp);
-
- blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
- error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
- za->za_dbuf = db;
- if (error) {
- (void) mutex_unlock(lp);
- return;
- }
- blkoff = off - blkoff;
- error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
+ ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
- if (error) {
- (void) mutex_unlock(lp);
- return;
- }
+ data = umem_zalloc(blocksize, UMEM_NOFAIL);
- if (blk.blk_birth == 0) { /* concurrent free */
- (void) mutex_unlock(lp);
- return;
+ while (ztest_random(count) != 0) {
+ uint64_t randoff = offset + (ztest_random(count) * blocksize);
+ if (ztest_write(zd, od[0].od_object, randoff, blocksize,
+ data) != 0)
+ break;
+ while (ztest_random(4) != 0)
+ ztest_io(zd, od[0].od_object, randoff);
}
- txg_suspend(dmu_objset_pool(os));
-
- (void) mutex_unlock(lp);
-
- ASSERT(blk.blk_fill == 1);
- ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
- ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
- ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
-
- /*
- * Read the block that dmu_sync() returned to make sure its contents
- * match what we wrote. We do this while still txg_suspend()ed
- * to ensure that the block can't be reused before we read it.
- */
- zb.zb_objset = dmu_objset_id(os);
- zb.zb_object = ZTEST_DIROBJ;
- zb.zb_level = 0;
- zb.zb_blkid = off / bs;
- error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
- NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
- ASSERT3U(error, ==, 0);
-
- txg_resume(dmu_objset_pool(os));
-
- bcopy(&iobuf[blkoff], rbt, btsize);
-
- if (rbt->bt_objset == 0) /* concurrent free */
- return;
-
- if (wbt->bt_objset == 0) /* all-zero overwrite */
- return;
-
- ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
- ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
- ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-
- /*
- * The semantic of dmu_sync() is that we always push the most recent
- * version of the data, so in the face of concurrent updates we may
- * see a newer version of the block. That's OK.
- */
- ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
- if (rbt->bt_thread == wbt->bt_thread)
- ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
- else
- ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
+ umem_free(data, blocksize);
}
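The rewritten tests above funnel all object setup through ztest_od_init() and ztest_object_init() instead of hand-rolled ZTEST_DIROBJ bookkeeping. A minimal sketch of the descriptor those calls imply; field names beyond od_object and od_blocksize (both referenced in this diff) are assumptions, not taken from the hunk:

/*
 * Hedged sketch: the object descriptor implied by the
 * ztest_od_init()/ztest_object_init() calls above.
 */
typedef struct ztest_od {
	uint64_t	od_dir;		/* assumed: directory slot */
	uint64_t	od_object;	/* object number, set on init */
	dmu_object_type_t od_type;	/* e.g. DMU_OT_UINT64_OTHER */
	uint64_t	od_blocksize;	/* 0 means "any blocksize" */
	uint64_t	od_gen;		/* assumed: generation number */
	char		od_name[MAXNAMELEN];
} ztest_od_t;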
/*
@@ -2603,9 +3740,10 @@ ztest_dmu_write_parallel(ztest_args_t *za)
#define ZTEST_ZAP_MAX_PROPS 1000
void
-ztest_zap(ztest_args_t *za)
+ztest_zap(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
uint64_t object;
uint64_t txg, last_txg;
uint64_t value[ZTEST_ZAP_MAX_INTS];
@@ -2614,64 +3752,45 @@ ztest_zap(ztest_args_t *za)
dmu_tx_t *tx;
char propname[100], txgname[100];
int error;
- char osname[MAXNAMELEN];
char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
- dmu_objset_name(os, osname);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
- /*
- * Create a new object if necessary, and record it in the directory.
- */
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+ return;
- if (object == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap test obj");
- dmu_tx_abort(tx);
- return;
- }
- object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
- if (error) {
- fatal(0, "zap_create('%s', %llu) = %d",
- osname, object, error);
- }
- ASSERT(object != 0);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, tx);
- /*
- * Generate a known hash collision, and verify that
- * we can lookup and remove both entries.
- */
- for (i = 0; i < 2; i++) {
- value[i] = i;
- error = zap_add(os, object, hc[i], sizeof (uint64_t),
- 1, &value[i], tx);
- ASSERT3U(error, ==, 0);
- }
- for (i = 0; i < 2; i++) {
- error = zap_add(os, object, hc[i], sizeof (uint64_t),
- 1, &value[i], tx);
- ASSERT3U(error, ==, EEXIST);
- error = zap_length(os, object, hc[i],
- &zl_intsize, &zl_ints);
- ASSERT3U(error, ==, 0);
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, 1);
- }
- for (i = 0; i < 2; i++) {
- error = zap_remove(os, object, hc[i], tx);
- ASSERT3U(error, ==, 0);
- }
+ object = od[0].od_object;
- dmu_tx_commit(tx);
+ /*
+ * Generate a known hash collision, and verify that
+ * we can lookup and remove both entries.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ return;
+ for (i = 0; i < 2; i++) {
+ value[i] = i;
+ VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+ 1, &value[i], tx));
+ }
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+ sizeof (uint64_t), 1, &value[i], tx));
+ VERIFY3U(0, ==,
+ zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, 1);
}
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+ }
+ dmu_tx_commit(tx);
+ /*
+ * Generate a bunch of random entries.

+ */
ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
@@ -2715,14 +3834,10 @@ ztest_zap(ztest_args_t *za)
* should be txg + object + n.
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- txg = dmu_tx_get_txg(tx);
if (last_txg > txg)
fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
@@ -2730,16 +3845,10 @@ ztest_zap(ztest_args_t *za)
for (i = 0; i < ints; i++)
value[i] = txg + object + i;
- error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
-
- error = zap_update(os, object, propname, sizeof (uint64_t),
- ints, value, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, propname, error);
+ VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+ 1, &txg, tx));
+ VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+ ints, value, tx));
dmu_tx_commit(tx);
@@ -2758,47 +3867,12 @@ ztest_zap(ztest_args_t *za)
ASSERT3U(error, ==, 0);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("remove zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- error = zap_remove(os, object, txgname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
-
- error = zap_remove(os, object, propname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, propname, error);
-
- dmu_tx_commit(tx);
-
- /*
- * Once in a while, destroy the object.
- */
- if (ztest_random(1000) != 0)
- return;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("destroy zap object");
- dmu_tx_abort(tx);
- return;
- }
- error = zap_destroy(os, object, tx);
- if (error)
- fatal(0, "zap_destroy('%s', %llu) = %d",
- osname, object, error);
- object = 0;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- &object, tx);
+ VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+ VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
dmu_tx_commit(tx);
}
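Every converted test now assigns transactions through ztest_tx_assign(), whose body is not part of this hunk. A sketch consistent with the inline dmu_tx_assign()/ERESTART/ENOSPC handling this diff removes; it returns the assigned txg, or 0 after aborting the tx:

/*
 * Hedged sketch of ztest_tx_assign(), reconstructed from the
 * error-handling pattern the old code used inline.
 */
static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	int error = dmu_tx_assign(tx, txg_how);

	if (error) {
		if (error == ERESTART) {
			/* only TXG_NOWAIT callers can see ERESTART */
			ASSERT(txg_how == TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	return (dmu_tx_get_txg(tx));
}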
@@ -2806,108 +3880,65 @@ ztest_zap(ztest_args_t *za)
* Testcase to test the upgrading of a microzap to fatzap.
*/
void
-ztest_fzap(ztest_args_t *za)
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- uint64_t object;
- uint64_t value;
- dmu_tx_t *tx;
- int i, error;
- char osname[MAXNAMELEN];
- char *name = "aaa";
- char entname[MAXNAMELEN];
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, txg;
- dmu_objset_name(os, osname);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
- /*
- * Create a new object if necessary, and record it in the directory.
- */
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+ return;
- if (object == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap test obj");
- dmu_tx_abort(tx);
- return;
- }
- object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
- if (error) {
- fatal(0, "zap_create('%s', %llu) = %d",
- osname, object, error);
- }
- ASSERT(object != 0);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, tx);
- dmu_tx_commit(tx);
- }
+ object = od[0].od_object;
/*
- * Add entries to this ZAP amd make sure it spills over
+ * Add entries to this ZAP and make sure it spills over
* and gets upgraded to a fatzap. Also, since we are adding
- * 2050 entries we should see ptrtbl growth and leaf-block
- * split.
+ * 2050 entries we should see ptrtbl growth and leaf-block split.
*/
- for (i = 0; i < 2050; i++) {
- (void) snprintf(entname, sizeof (entname), "%s-%d", name, i);
- value = i;
+ for (int i = 0; i < 2050; i++) {
+ char name[MAXNAMELEN];
+ uint64_t value = i;
+ dmu_tx_t *tx;
+ int error;
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, entname);
- error = dmu_tx_assign(tx, TXG_WAIT);
+ (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+ id, value);
- if (error) {
- ztest_record_enospc("create zap entry");
- dmu_tx_abort(tx);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, name);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- error = zap_add(os, object, entname, sizeof (uint64_t),
- 1, &value, tx);
-
+ error = zap_add(os, object, name, sizeof (uint64_t), 1,
+ &value, tx);
ASSERT(error == 0 || error == EEXIST);
dmu_tx_commit(tx);
}
-
- /*
- * Once in a while, destroy the object.
- */
- if (ztest_random(1000) != 0)
- return;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("destroy zap object");
- dmu_tx_abort(tx);
- return;
- }
- error = zap_destroy(os, object, tx);
- if (error)
- fatal(0, "zap_destroy('%s', %llu) = %d",
- osname, object, error);
- object = 0;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- &object, tx);
- dmu_tx_commit(tx);
}
+/* ARGSUSED */
void
-ztest_zap_parallel(ztest_args_t *za)
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
dmu_tx_t *tx;
int i, namelen, error;
+ int micro = ztest_random(2);
char name[20], string_value[20];
void *data;
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ object = od[0].od_object;
+
/*
* Generate a random name of the form 'xxx.....' where each
* x is a random printable character and the dots are dots.
@@ -2922,12 +3953,7 @@ ztest_zap_parallel(ztest_args_t *za)
name[i] = '.';
name[i] = '\0';
- if (ztest_random(2) == 0)
- object = ZTEST_MICROZAP_OBJ;
- else
- object = ZTEST_FATZAP_OBJ;
-
- if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+ if ((namelen & 1) || micro) {
wsize = sizeof (txg);
wc = 1;
data = &txg;
@@ -2948,14 +3974,10 @@ ztest_zap_parallel(ztest_args_t *za)
if (i >= 2) {
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("zap parallel");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- txg = dmu_tx_get_txg(tx);
bcopy(name, string_value, namelen);
} else {
tx = NULL;
@@ -3006,79 +4028,400 @@ ztest_zap_parallel(ztest_args_t *za)
dmu_tx_commit(tx);
}
+/*
+ * Commit callback data.
+ */
+typedef struct ztest_cb_data {
+ list_node_t zcd_node;
+ uint64_t zcd_txg;
+ int zcd_expected_err;
+ boolean_t zcd_added;
+ boolean_t zcd_called;
+ spa_t *zcd_spa;
+} ztest_cb_data_t;
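The zcl global referenced below (zcl.zcl_callbacks_lock, zcl.zcl_callbacks) is defined outside this hunk. A minimal sketch of the shape the code assumes:

/*
 * Hedged sketch: the global commit-callback list implied by the
 * zcl references in the functions below.
 */
typedef struct ztest_cb_list {
	mutex_t	zcl_callbacks_lock;	/* guards the list */
	list_t	zcl_callbacks;		/* ztest_cb_data_t, ordered by txg */
} ztest_cb_list_t;

static ztest_cb_list_t zcl;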
+
+/* This is the actual commit callback function */
+static void
+ztest_commit_callback(void *arg, int error)
+{
+ ztest_cb_data_t *data = arg;
+ uint64_t synced_txg;
+
+ VERIFY(data != NULL);
+ VERIFY3S(data->zcd_expected_err, ==, error);
+ VERIFY(!data->zcd_called);
+
+ synced_txg = spa_last_synced_txg(data->zcd_spa);
+ if (data->zcd_txg > synced_txg)
+ fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+ ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+ synced_txg);
+
+ data->zcd_called = B_TRUE;
+
+ if (error == ECANCELED) {
+ ASSERT3U(data->zcd_txg, ==, 0);
+ ASSERT(!data->zcd_added);
+
+ /*
+ * The private callback data should be destroyed here, but
+ * since we are going to check the zcd_called field after
+ * dmu_tx_abort(), we will destroy it there.
+ */
+ return;
+ }
+
+ /* Was this callback added to the global callback list? */
+ if (!data->zcd_added)
+ goto out;
+
+ ASSERT3U(data->zcd_txg, !=, 0);
+
+ /* Remove our callback from the list */
+ (void) mutex_lock(&zcl.zcl_callbacks_lock);
+ list_remove(&zcl.zcl_callbacks, data);
+ (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+
+out:
+ umem_free(data, sizeof (ztest_cb_data_t));
+}
+
+/* Allocate and initialize callback data structure */
+static ztest_cb_data_t *
+ztest_create_cb_data(objset_t *os, uint64_t txg)
+{
+ ztest_cb_data_t *cb_data;
+
+ cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+
+ cb_data->zcd_txg = txg;
+ cb_data->zcd_spa = dmu_objset_spa(os);
+
+ return (cb_data);
+}
+
+/*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)
+
+/*
+ * Commit callback test.
+ */
void
-ztest_dsl_prop_get_set(ztest_args_t *za)
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ dmu_tx_t *tx;
+ ztest_cb_data_t *cb_data[3], *tmp_cb;
+ uint64_t old_txg, txg;
+ int i, error = 0;
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ tx = dmu_tx_create(os);
+
+ cb_data[0] = ztest_create_cb_data(os, 0);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+ dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
+
+ /* Every once in a while, abort the transaction on purpose */
+ if (ztest_random(100) == 0)
+ error = -1;
+
+ if (!error)
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+ txg = error ? 0 : dmu_tx_get_txg(tx);
+
+ cb_data[0]->zcd_txg = txg;
+ cb_data[1] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
+ if (error) {
+ /*
+ * It's not a strict requirement to call the registered
+ * callbacks from inside dmu_tx_abort(), but that is what
+ * is supposed to happen in the current implementation,
+ * so we check for that here.
+ */
+ for (i = 0; i < 2; i++) {
+ cb_data[i]->zcd_expected_err = ECANCELED;
+ VERIFY(!cb_data[i]->zcd_called);
+ }
+
+ dmu_tx_abort(tx);
+
+ for (i = 0; i < 2; i++) {
+ VERIFY(cb_data[i]->zcd_called);
+ umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+ }
+
+ return;
+ }
+
+ cb_data[2] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
+
+ /*
+ * Read existing data to make sure there isn't a future leak.
+ */
+ VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
+ &old_txg, DMU_READ_PREFETCH));
+
+ if (old_txg > txg)
+ fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+ old_txg, txg);
+
+ dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
+
+ (void) mutex_lock(&zcl.zcl_callbacks_lock);
+
+ /*
+ * Since commit callbacks don't have any ordering requirement and since
+ * it is theoretically possible for a commit callback to be called
+ * after an arbitrary amount of time has elapsed since its txg has been
+ * synced, it is difficult to reliably determine whether a commit
+ * callback hasn't been called due to high load or due to a flawed
+ * implementation.
+ *
+ * In practice, we assume that if a commit callback hasn't been
+ * called after a certain number of txgs, there is most likely an
+ * implementation bug.
+ */
+ tmp_cb = list_head(&zcl.zcl_callbacks);
+ if (tmp_cb != NULL &&
+ tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+ fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+ PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+ }
+
+ /*
+ * Let's find the place to insert our callbacks.
+ *
+ * Even though the list is ordered by txg, it is possible for the
+ * insertion point to not be the end because our txg may already be
+ * quiescing at this point and other callbacks in the open txg
+ * (from other objsets) may have sneaked in.
+ */
+ tmp_cb = list_tail(&zcl.zcl_callbacks);
+ while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+ tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+ /* Add the 3 callbacks to the list */
+ for (i = 0; i < 3; i++) {
+ if (tmp_cb == NULL)
+ list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+ else
+ list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+ cb_data[i]);
+
+ cb_data[i]->zcd_added = B_TRUE;
+ VERIFY(!cb_data[i]->zcd_called);
+
+ tmp_cb = cb_data[i];
+ }
+
+ (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+
+ dmu_tx_commit(tx);
+}
+
+/* ARGSUSED */
+void
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+ zfs_prop_t proplist[] = {
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_COPIES,
+ ZFS_PROP_DEDUP
+ };
+ ztest_shared_t *zs = ztest_shared;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+ ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+ (void) rw_unlock(&zs->zs_name_lock);
+}
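ztest_dsl_prop_set_uint64() wraps the dsl_prop_set() call that the old ztest_dsl_prop_get_set() made inline (see the removed code below). A sketch consistent with that removed pattern; the exact body is an assumption, and the zfs_prop_to_name() conversion is inferred from the switch to zfs_prop_t arguments:

/*
 * Hedged sketch of ztest_dsl_prop_set_uint64(), modeled on the
 * dsl_prop_set() + ENOSPC handling this diff removes.
 */
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	int error;

	error = dsl_prop_set(osname, propname, sizeof (value),
	    inherit ? 0 : 1, &value);
	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);
	return (0);
}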
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ nvlist_t *props = NULL;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+ ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+
+ VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+
+ if (zopt_verbose >= 6)
+ dump_nvlist(props, 4);
+
+ nvlist_free(props);
+
+ (void) rw_unlock(&zs->zs_name_lock);
+}
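Its pool-property counterpart, ztest_spa_prop_set_uint64(), presumably builds a one-entry nvlist and hands it to spa_prop_set(); the body below is an assumption built from that interface, not part of this hunk:

/*
 * Hedged sketch of ztest_spa_prop_set_uint64().
 */
static int
ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop,
    uint64_t value)
{
	nvlist_t *props = NULL;
	int error;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop),
	    value) == 0);

	error = spa_prop_set(zs->zs_spa, props);
	nvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);
	return (0);
}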
+
+/*
+ * Test snapshot hold/release and deferred destroy.
+ */
+void
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- int i, inherit;
- uint64_t value;
- const char *prop, *valname;
- char setpoint[MAXPATHLEN];
- char osname[MAXNAMELEN];
int error;
+ objset_t *os = zd->zd_os;
+ objset_t *origin;
+ char snapname[100];
+ char fullname[100];
+ char clonename[100];
+ char tag[100];
+ char osname[MAXNAMELEN];
(void) rw_rdlock(&ztest_shared->zs_name_lock);
dmu_objset_name(os, osname);
- for (i = 0; i < 2; i++) {
- if (i == 0) {
- prop = "checksum";
- value = ztest_random_checksum();
- inherit = (value == ZIO_CHECKSUM_INHERIT);
- } else {
- prop = "compression";
- value = ztest_random_compress();
- inherit = (value == ZIO_COMPRESS_INHERIT);
+ (void) snprintf(snapname, 100, "sh1_%llu", id);
+ (void) snprintf(fullname, 100, "%s@%s", osname, snapname);
+ (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
+ (void) snprintf(tag, 100, "%tag_%llu", id);
+
+ /*
+ * Clean up from any previous run.
+ */
+ (void) dmu_objset_destroy(clonename, B_FALSE);
+ (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+ (void) dmu_objset_destroy(fullname, B_FALSE);
+
+ /*
+ * Create snapshot, clone it, mark snap for deferred destroy,
+ * destroy clone, verify snap was also destroyed.
+ */
+ error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
+ FALSE, -1);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc("dmu_objset_snapshot");
+ goto out;
}
+ fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+ }
- error = dsl_prop_set(osname, prop, sizeof (value),
- !inherit, &value);
+ error = dmu_objset_hold(fullname, FTAG, &origin);
+ if (error)
+ fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
+ error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
+ dmu_objset_rele(origin, FTAG);
+ if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dsl_prop_set");
- break;
+ ztest_record_enospc("dmu_objset_clone");
+ goto out;
}
+ fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
+ }
- ASSERT3U(error, ==, 0);
+ error = dmu_objset_destroy(fullname, B_TRUE);
+ if (error) {
+ fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+ fullname, error);
+ }
- VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
- 1, &value, setpoint), ==, 0);
+ error = dmu_objset_destroy(clonename, B_FALSE);
+ if (error)
+ fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
- if (i == 0)
- valname = zio_checksum_table[value].ci_name;
- else
- valname = zio_compress_table[value].ci_name;
+ error = dmu_objset_hold(fullname, FTAG, &origin);
+ if (error != ENOENT)
+ fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
- if (zopt_verbose >= 6) {
- (void) printf("%s %s = %s for '%s'\n",
- osname, prop, valname, setpoint);
+ /*
+ * Create snapshot, add temporary hold, verify that we can't
+ * destroy a held snapshot, mark for deferred destroy,
+ * release hold, verify snapshot was destroyed.
+ */
+ error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
+ FALSE, -1);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc("dmu_objset_snapshot");
+ goto out;
}
+ fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+ }
+
+ error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE,
+ B_TRUE, -1);
+ if (error)
+ fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
+
+ error = dmu_objset_destroy(fullname, B_FALSE);
+ if (error != EBUSY) {
+ fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
+ fullname, error);
+ }
+
+ error = dmu_objset_destroy(fullname, B_TRUE);
+ if (error) {
+ fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+ fullname, error);
}
+ error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+ if (error)
+ fatal(0, "dsl_dataset_user_release(%s)", fullname, tag);
+
+ VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
+
+out:
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
/*
* Inject random faults into the on-disk data.
*/
+/* ARGSUSED */
void
-ztest_fault_inject(ztest_args_t *za)
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
int fd;
uint64_t offset;
- uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ uint64_t leaves;
uint64_t bad = 0x1990c0ffeedecadeULL;
uint64_t top, leaf;
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
- spa_t *spa = za->za_spa;
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
- int maxfaults = zopt_maxfaults;
+ int maxfaults;
+ int mirror_save;
vdev_t *vd0 = NULL;
uint64_t guid0 = 0;
+ boolean_t islog = B_FALSE;
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ maxfaults = MAXFAULTS();
+ leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
+ mirror_save = zs->zs_mirrors;
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
ASSERT(leaves >= 1);
@@ -3089,10 +4432,10 @@ ztest_fault_inject(ztest_args_t *za)
if (ztest_random(2) == 0) {
/*
- * Inject errors on a normal data device.
+ * Inject errors on a normal data device or slog device.
*/
- top = ztest_random(spa->spa_root_vdev->vdev_children);
- leaf = ztest_random(leaves);
+ top = ztest_random_vdev_top(spa, B_TRUE);
+ leaf = ztest_random(leaves) + zs->zs_splits;
/*
* Generate paths to the first leaf in this top-level vdev,
@@ -3101,11 +4444,14 @@ ztest_fault_inject(ztest_args_t *za)
* and we'll write random garbage to the randomly chosen leaf.
*/
(void) snprintf(path0, sizeof (path0), ztest_dev_template,
- zopt_dir, zopt_pool, top * leaves + 0);
+ zopt_dir, zopt_pool, top * leaves + zs->zs_splits);
(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
zopt_dir, zopt_pool, top * leaves + leaf);
vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+ if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+ islog = B_TRUE;
+
if (vd0 != NULL && maxfaults != 1) {
/*
* Make vd0 explicitly claim to be unreadable,
@@ -3151,22 +4497,38 @@ ztest_fault_inject(ztest_args_t *za)
spa_config_exit(spa, SCL_STATE, FTAG);
- if (maxfaults == 0)
- return;
-
/*
- * If we can tolerate two or more faults, randomly online/offline vd0.
+ * If we can tolerate two or more faults, or we're dealing
+ * with a slog, randomly online/offline vd0.
*/
- if (maxfaults >= 2 && guid0 != 0) {
+ if ((maxfaults >= 2 || islog) && guid0 != 0) {
if (ztest_random(10) < 6) {
int flags = (ztest_random(2) == 0 ?
ZFS_OFFLINE_TEMPORARY : 0);
+
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between offlining a slog and
+ * destroying a dataset. Offlining the slog will
+ * grab a reference on the dataset which may cause
+ * dmu_objset_destroy() to fail with EBUSY thus
+ * leaving the dataset in an inconsistent state.
+ */
+ if (islog)
+ (void) rw_wrlock(&ztest_shared->zs_name_lock);
+
VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+ if (islog)
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
} else {
(void) vdev_online(spa, guid0, 0, NULL);
}
}
+ if (maxfaults == 0)
+ return;
+
/*
* We have at least single-fault tolerance, so inject data corruption.
*/
@@ -3185,173 +4547,198 @@ ztest_fault_inject(ztest_args_t *za)
if (offset >= fsize)
continue;
- if (zopt_verbose >= 6)
- (void) printf("injecting bad word into %s,"
- " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ if (mirror_save != zs->zs_mirrors) {
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ (void) close(fd);
+ return;
+ }
if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
fatal(1, "can't inject bad word at 0x%llx in %s",
offset, pathrand);
+
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+
+ if (zopt_verbose >= 7)
+ (void) printf("injected bad word into %s,"
+ " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
}
(void) close(fd);
}
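MAXFAULTS() replaces the old static zopt_maxfaults snapshot so the fault budget tracks the live mirror count under zs_vdev_lock. A plausible definition, derived from how leaves is computed above; the exact expression (including the zopt_raidz_parity term) is an assumption, not shown in this diff:

/*
 * Hedged sketch: a MAXFAULTS() consistent with the locking above.
 * Relies on a local ztest_shared_t *zs being in scope.
 */
#define	MAXFAULTS() \
	(MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1)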
/*
- * Scrub the pool.
+ * Verify that DDT repair works as expected.
*/
void
-ztest_scrub(ztest_args_t *za)
+ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
-
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
-}
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, blocksize, txg, pattern, psize;
+ enum zio_checksum checksum = spa_dedup_checksum(spa);
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ void *buf;
+ blkptr_t blk;
+ int copies = 2 * ZIO_DEDUPDITTO_MIN;
-/*
- * Rename the pool to a different name and then rename it back.
- */
-void
-ztest_spa_rename(ztest_args_t *za)
-{
- char *oldname, *newname;
- int error;
- spa_t *spa;
+ blocksize = ztest_random_blocksize();
+ blocksize = MIN(blocksize, 2048); /* because we write so many */
- (void) rw_wrlock(&ztest_shared->zs_name_lock);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
- oldname = za->za_pool;
- newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
- (void) strcpy(newname, oldname);
- (void) strcat(newname, "_tmp");
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
/*
- * Do the rename
+ * Take the name lock as writer to prevent anyone else from changing
+ * the pool and dataset properties we need to maintain during this test.
*/
- error = spa_rename(oldname, newname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", oldname,
- newname, error);
+ (void) rw_wrlock(&zs->zs_name_lock);
- /*
- * Try to open it under the old name, which shouldn't exist
- */
- error = spa_open(oldname, &spa, FTAG);
- if (error != ENOENT)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+ B_FALSE) != 0 ||
+ ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+ B_FALSE) != 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
+
+ object = od[0].od_object;
+ blocksize = od[0].od_blocksize;
+ pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+
+ ASSERT(object != 0);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
/*
- * Open it under the new name and make sure it's still the same spa_t.
+ * Write all the copies of our block.
*/
- error = spa_open(newname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", newname, error);
+ for (int i = 0; i < copies; i++) {
+ uint64_t offset = i * blocksize;
+ VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
+ DMU_READ_NO_PREFETCH) == 0);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == blocksize);
+ ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+ ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+ dmu_buf_will_fill(db, tx);
+ ztest_pattern_set(db->db_data, db->db_size, pattern);
+ dmu_buf_rele(db, FTAG);
+ }
- ASSERT(spa == za->za_spa);
- spa_close(spa, FTAG);
+ dmu_tx_commit(tx);
+ txg_wait_synced(spa_get_dsl(spa), txg);
/*
- * Rename it back to the original
+ * Find out what block we got.
*/
- error = spa_rename(newname, oldname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", newname,
- oldname, error);
+ VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
+ DMU_READ_NO_PREFETCH) == 0);
+ blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+ dmu_buf_rele(db, FTAG);
/*
- * Make sure it can still be opened
+ * Damage the block. Dedup-ditto will save us when we read it later.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ psize = BP_GET_PSIZE(&blk);
+ buf = zio_buf_alloc(psize);
+ ztest_pattern_set(buf, psize, ~pattern);
- ASSERT(spa == za->za_spa);
- spa_close(spa, FTAG);
+ (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+ buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
- umem_free(newname, strlen(newname) + 1);
+ zio_buf_free(buf, psize);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ (void) rw_unlock(&zs->zs_name_lock);
}
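ztest_pattern_set() and ztest_pattern_match() fill and verify a buffer with a repeating 64-bit word; their bodies are not in this hunk. Minimal sketches consistent with the usage above (bodies are assumptions):

/*
 * Hedged sketches of the pattern helpers used by ztest_ddt_repair().
 */
static void
ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

	while (ip < ip_end)
		*ip++ = value;
}

static boolean_t
ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
	uint64_t diff = 0;

	/* OR together every mismatch; zero means a perfect match */
	while (ip < ip_end)
		diff |= (value ^ *ip++);

	return (diff == 0);
}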
-
/*
- * Completely obliterate one disk.
+ * Scrub the pool.
*/
-static void
-ztest_obliterate_one_disk(uint64_t vdev)
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
- int fd;
- char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
- size_t fsize;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
- if (zopt_maxfaults < 2)
- return;
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+ (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+}
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
- (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+/*
+ * Rename the pool to a different name and then rename it back.
+ */
+/* ARGSUSED */
+void
+ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ char *oldname, *newname;
+ spa_t *spa;
- fd = open(dev_name, O_RDWR);
+ (void) rw_wrlock(&zs->zs_name_lock);
- if (fd == -1)
- fatal(1, "can't open %s", dev_name);
+ oldname = zs->zs_pool;
+ newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+ (void) strcpy(newname, oldname);
+ (void) strcat(newname, "_tmp");
/*
- * Determine the size.
+ * Do the rename
*/
- fsize = lseek(fd, 0, SEEK_END);
-
- (void) close(fd);
+ VERIFY3U(0, ==, spa_rename(oldname, newname));
/*
- * Rename the old device to dev_name.old (useful for debugging).
+ * Try to open it under the old name, which shouldn't exist
*/
- VERIFY(rename(dev_name, copy_name) == 0);
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
- * Create a new one.
+ * Open it under the new name and make sure it's still the same spa_t.
*/
- VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
- VERIFY(ftruncate(fd, fsize) == 0);
- (void) close(fd);
-}
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
-static void
-ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
-{
- char dev_name[MAXPATHLEN];
- nvlist_t *root;
- int error;
- uint64_t guid;
- vdev_t *vd;
+ ASSERT(spa == zs->zs_spa);
+ spa_close(spa, FTAG);
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+ /*
+ * Rename it back to the original
+ */
+ VERIFY3U(0, ==, spa_rename(newname, oldname));
/*
- * Build the nvlist describing dev_name.
+ * Make sure it can still be opened
*/
- root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
- guid = 0;
- else
- guid = vd->vdev_guid;
- spa_config_exit(spa, SCL_VDEV, FTAG);
- error = spa_vdev_attach(spa, guid, root, B_TRUE);
- if (error != 0 &&
- error != EBUSY &&
- error != ENOTSUP &&
- error != ENODEV &&
- error != EDOM)
- fatal(0, "spa_vdev_attach(in-place) = %d", error);
+ ASSERT(spa == zs->zs_spa);
+ spa_close(spa, FTAG);
- nvlist_free(root);
+ umem_free(newname, strlen(newname) + 1);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
+/*
+ * Verify pool integrity by running zdb.
+ */
static void
-ztest_verify_blocks(char *pool)
+ztest_run_zdb(char *pool)
{
int status;
char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@ -3372,11 +4759,12 @@ ztest_verify_blocks(char *pool)
isa = strdup(isa);
/* LINTED */
(void) sprintf(bin,
- "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
+ "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s",
isalen,
isa,
zopt_verbose >= 3 ? "s" : "",
zopt_verbose >= 4 ? "v" : "",
+ spa_config_path,
pool);
free(isa);
@@ -3423,7 +4811,6 @@ ztest_spa_import_export(char *oldname, char *newname)
nvlist_t *config, *newconfig;
uint64_t pool_guid;
spa_t *spa;
- int error;
if (zopt_verbose >= 4) {
(void) printf("import/export: old = %s, new = %s\n",
@@ -3438,15 +4825,13 @@ ztest_spa_import_export(char *oldname, char *newname)
/*
* Get the pool's configuration and guid.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
/*
* Kick off a scrub to tickle scrub/export races.
*/
if (ztest_random(2) == 0)
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
pool_guid = spa_guid(spa);
spa_close(spa, FTAG);
@@ -3456,9 +4841,7 @@ ztest_spa_import_export(char *oldname, char *newname)
/*
* Export it.
*/
- error = spa_export(oldname, &config, B_FALSE, B_FALSE);
- if (error)
- fatal(0, "spa_export('%s') = %d", oldname, error);
+ VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
ztest_walk_pool_directory("pools after export");
@@ -3472,39 +4855,29 @@ ztest_spa_import_export(char *oldname, char *newname)
/*
* Import it under the new name.
*/
- error = spa_import(newname, config, NULL);
- if (error)
- fatal(0, "spa_import('%s') = %d", newname, error);
+ VERIFY3U(0, ==, spa_import(newname, config, NULL, 0));
ztest_walk_pool_directory("pools after import");
/*
* Try to import it again -- should fail with EEXIST.
*/
- error = spa_import(newname, config, NULL);
- if (error != EEXIST)
- fatal(0, "spa_import('%s') twice", newname);
+ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
/*
* Try to import it under a different name -- should fail with EEXIST.
*/
- error = spa_import(oldname, config, NULL);
- if (error != EEXIST)
- fatal(0, "spa_import('%s') under multiple names", newname);
+ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
/*
* Verify that the pool is no longer visible under the old name.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != ENOENT)
- fatal(0, "spa_open('%s') = %d", newname, error);
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
* Verify that we can open and close the pool using the new name.
*/
- error = spa_open(newname, &spa, FTAG);
- if (error)
- fatal(0, "spa_open('%s') = %d", newname, error);
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
ASSERT(pool_guid == spa_guid(spa));
spa_close(spa, FTAG);
@@ -3514,12 +4887,12 @@ ztest_spa_import_export(char *oldname, char *newname)
static void
ztest_resume(spa_t *spa)
{
- if (spa_suspended(spa)) {
- spa_vdev_state_enter(spa);
- vdev_clear(spa, NULL);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- (void) zio_resume(spa);
- }
+ if (spa_suspended(spa) && zopt_verbose >= 6)
+ (void) printf("resuming from suspended state\n");
+ spa_vdev_state_enter(spa, SCL_NONE);
+ vdev_clear(spa, NULL);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ (void) zio_resume(spa);
}
static void *
@@ -3528,155 +4901,252 @@ ztest_resume_thread(void *arg)
spa_t *spa = arg;
while (!ztest_exiting) {
- (void) poll(NULL, 0, 1000);
- ztest_resume(spa);
+ if (spa_suspended(spa))
+ ztest_resume(spa);
+ (void) poll(NULL, 0, 100);
}
return (NULL);
}
static void *
+ztest_deadman_thread(void *arg)
+{
+ ztest_shared_t *zs = arg;
+ int grace = 300;
+ hrtime_t delta;
+
+ delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+
+ (void) poll(NULL, 0, (int)(1000 * delta));
+
+ fatal(0, "failed to complete within %d seconds of deadline", grace);
+
+ return (NULL);
+}
+
+static void
+ztest_execute(ztest_info_t *zi, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+ hrtime_t functime = gethrtime();
+
+ for (int i = 0; i < zi->zi_iters; i++)
+ zi->zi_func(zd, id);
+
+ functime = gethrtime() - functime;
+
+ atomic_add_64(&zi->zi_call_count, 1);
+ atomic_add_64(&zi->zi_call_time, functime);
+
+ if (zopt_verbose >= 4) {
+ Dl_info dli;
+ (void) dladdr((void *)zi->zi_func, &dli);
+ (void) printf("%6.2f sec in %s\n",
+ (double)functime / NANOSEC, dli.dli_sname);
+ }
+}
+
+static void *
ztest_thread(void *arg)
{
- ztest_args_t *za = arg;
+ uint64_t id = (uintptr_t)arg;
ztest_shared_t *zs = ztest_shared;
- hrtime_t now, functime;
+ uint64_t call_next;
+ hrtime_t now;
ztest_info_t *zi;
- int f, i;
- while ((now = gethrtime()) < za->za_stop) {
+ while ((now = gethrtime()) < zs->zs_thread_stop) {
/*
* See if it's time to force a crash.
*/
- if (now > za->za_kill) {
- zs->zs_alloc = spa_get_alloc(za->za_spa);
- zs->zs_space = spa_get_space(za->za_spa);
- (void) kill(getpid(), SIGKILL);
- }
+ if (now > zs->zs_thread_kill)
+ ztest_kill(zs);
/*
- * Pick a random function.
+ * If we're getting ENOSPC with some regularity, stop.
*/
- f = ztest_random(ZTEST_FUNCS);
- zi = &zs->zs_info[f];
+ if (zs->zs_enospc_count > 10)
+ break;
/*
- * Decide whether to call it, based on the requested frequency.
+ * Pick a random function to execute.
*/
- if (zi->zi_call_target == 0 ||
- (double)zi->zi_call_total / zi->zi_call_target >
- (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
- continue;
+ zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+ call_next = zi->zi_call_next;
- atomic_add_64(&zi->zi_calls, 1);
- atomic_add_64(&zi->zi_call_total, 1);
+ if (now >= call_next &&
+ atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+ ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+ ztest_execute(zi, id);
+ }
- za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
- ZTEST_DIRSIZE;
- za->za_diroff_shared = (1ULL << 63);
+ return (NULL);
+}
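The atomic_cas_64() in ztest_thread() above is what lets all threads share one schedule without a lock: only the thread whose CAS succeeds in advancing zi_call_next runs the function, and since the expected value of ztest_random(2*t + 1) is t, calls arrive at the average interval zi_interval[0]. An illustrative refactoring of that claim step (ztest_claim is a hypothetical name, not in the diff):

/*
 * Illustrative only: the lock-free claim pattern used above.
 * Exactly one thread wins the CAS and runs the test function.
 */
static boolean_t
ztest_claim(ztest_info_t *zi, hrtime_t now)
{
	uint64_t next = zi->zi_call_next;

	return (now >= next &&
	    atomic_cas_64(&zi->zi_call_next, next,
	    next + ztest_random(2 * zi->zi_interval[0] + 1)) == next);
}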
- for (i = 0; i < zi->zi_iters; i++)
- zi->zi_func(za);
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+ (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+}
- functime = gethrtime() - now;
+static void
+ztest_dataset_destroy(ztest_shared_t *zs, int d)
+{
+ char name[MAXNAMELEN];
- atomic_add_64(&zi->zi_call_time, functime);
+ ztest_dataset_name(name, zs->zs_pool, d);
- if (zopt_verbose >= 4) {
- Dl_info dli;
- (void) dladdr((void *)zi->zi_func, &dli);
- (void) printf("%6.2f sec in %s\n",
- (double)functime / NANOSEC, dli.dli_sname);
- }
+ if (zopt_verbose >= 3)
+ (void) printf("Destroying %s to free up space\n", name);
- /*
- * If we're getting ENOSPC with some regularity, stop.
- */
- if (zs->zs_enospc_count > 10)
- break;
+ /*
+ * Cleanup any non-standard clones and snapshots. In general,
+ * ztest thread t operates on dataset (t % zopt_datasets),
+ * so there may be more than one thing to clean up.
+ */
+ for (int t = d; t < zopt_threads; t += zopt_datasets)
+ ztest_dsl_dataset_cleanup(name, t);
+
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+ uint64_t usedobjs, dirobjs, scratch;
+
+ /*
+ * ZTEST_DIROBJ is the object directory for the entire dataset.
+ * Therefore, the number of objects in use should equal the
+ * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+ * If not, we have an object leak.
+ *
+ * Note that we can only check this in ztest_dataset_open(),
+ * when the open-context and syncing-context values agree.
+ * That's because zap_count() returns the open-context value,
+ * while dmu_objset_space() returns the rootbp fill count.
+ */
+ VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+ dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+ ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(ztest_shared_t *zs, int d)
+{
+ ztest_ds_t *zd = &zs->zs_zd[d];
+ uint64_t committed_seq = zd->zd_seq;
+ objset_t *os;
+ zilog_t *zilog;
+ char name[MAXNAMELEN];
+ int error;
+
+ ztest_dataset_name(name, zs->zs_pool, d);
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ error = ztest_dataset_create(name);
+ if (error == ENOSPC) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ ztest_record_enospc(FTAG);
+ return (error);
}
+ ASSERT(error == 0 || error == EEXIST);
- return (NULL);
+ VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+ (void) rw_unlock(&zs->zs_name_lock);
+
+ ztest_zd_init(zd, os);
+
+ zilog = zd->zd_zilog;
+
+ if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+ zilog->zl_header->zh_claim_lr_seq < committed_seq)
+ fatal(0, "missing log records: claimed %llu < committed %llu",
+ zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ zil_replay(os, zd, ztest_replay_vector);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ if (zopt_verbose >= 6)
+ (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ zd->zd_name,
+ (u_longlong_t)zilog->zl_parse_blk_count,
+ (u_longlong_t)zilog->zl_parse_lr_count,
+ (u_longlong_t)zilog->zl_replaying_seq);
+
+ zilog = zil_open(os, ztest_get_data);
+
+ if (zilog->zl_replaying_seq != 0 &&
+ zilog->zl_replaying_seq < committed_seq)
+ fatal(0, "missing log records: replayed %llu < committed %llu",
+ zilog->zl_replaying_seq, committed_seq);
+
+ return (0);
+}
+
+static void
+ztest_dataset_close(ztest_shared_t *zs, int d)
+{
+ ztest_ds_t *zd = &zs->zs_zd[d];
+
+ zil_close(zd->zd_zilog);
+ dmu_objset_rele(zd->zd_os, zd);
+
+ ztest_zd_fini(zd);
}
/*
* Kick off threads to run tests on all datasets in parallel.
*/
static void
-ztest_run(char *pool)
+ztest_run(ztest_shared_t *zs)
{
- int t, d, error;
- ztest_shared_t *zs = ztest_shared;
- ztest_args_t *za;
+ thread_t *tid;
spa_t *spa;
- char name[100];
thread_t resume_tid;
+ int error;
ztest_exiting = B_FALSE;
- (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
- (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
-
- for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
- (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
-
- /*
- * Destroy one disk before we even start.
- * It's mirrored, so everything should work just fine.
- * This makes us exercise fault handling very early in spa_load().
- */
- ztest_obliterate_one_disk(0);
-
- /*
- * Verify that the sum of the sizes of all blocks in the pool
- * equals the SPA's allocated space total.
- */
- ztest_verify_blocks(pool);
-
/*
- * Kick off a replacement of the disk we just obliterated.
+ * Initialize parent/child shared state.
*/
- kernel_init(FREAD | FWRITE);
- VERIFY(spa_open(pool, &spa, FTAG) == 0);
- ztest_replace_one_disk(spa, 0);
- if (zopt_verbose >= 5)
- show_pool_stats(spa);
- spa_close(spa, FTAG);
- kernel_fini();
+ VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
- kernel_init(FREAD | FWRITE);
+ zs->zs_thread_start = gethrtime();
+ zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
+ zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+ zs->zs_thread_kill = zs->zs_thread_stop;
+ if (ztest_random(100) < zopt_killrate)
+ zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
- /*
- * Verify that we can export the pool and reimport it under a
- * different name.
- */
- if (ztest_random(2) == 0) {
- (void) snprintf(name, 100, "%s_import", pool);
- ztest_spa_import_export(pool, name);
- ztest_spa_import_export(name, pool);
- }
+ (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
- /*
- * Verify that we can loop over all pools.
- */
- mutex_enter(&spa_namespace_lock);
- for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
- if (zopt_verbose > 3) {
- (void) printf("spa_next: found %s\n", spa_name(spa));
- }
- }
- mutex_exit(&spa_namespace_lock);
+ list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
+ offsetof(ztest_cb_data_t, zcd_node));
/*
* Open our pool.
*/
- VERIFY(spa_open(pool, &spa, FTAG) == 0);
+ kernel_init(FREAD | FWRITE);
+ VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+ zs->zs_spa = spa;
+
+ spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
/*
* We don't expect the pool to suspend unless maxfaults == 0,
* in which case ztest_fault_inject() temporarily takes away
* the only valid replica.
*/
- if (zopt_maxfaults == 0)
+ if (MAXFAULTS() == 0)
spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
else
spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
@@ -3688,13 +5158,19 @@ ztest_run(char *pool)
&resume_tid) == 0);
/*
+ * Create a deadman thread to abort() if we hang.
+ */
+ VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
+ NULL) == 0);
+
+ /*
* Verify that we can safely inquire about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
*/
- for (t = 0; t < 64; t++) {
- for (d = -5; d <= 5; d++) {
+ for (int t = 0; t < 64; t++) {
+ for (int d = -5; d <= 5; d++) {
error = dmu_object_info(spa->spa_meta_objset,
(1ULL << t) + d, NULL);
ASSERT(error == 0 || error == ENOENT ||
@@ -3703,101 +5179,45 @@ ztest_run(char *pool)
}
/*
- * Now kick off all the tests that run in parallel.
+ * If we got any ENOSPC errors on the previous run, destroy something.
*/
+ if (zs->zs_enospc_count != 0) {
+ int d = ztest_random(zopt_datasets);
+ ztest_dataset_destroy(zs, d);
+ }
zs->zs_enospc_count = 0;
- za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+ tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);
if (zopt_verbose >= 4)
(void) printf("starting main threads...\n");
- za[0].za_start = gethrtime();
- za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
- za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
- za[0].za_kill = za[0].za_stop;
- if (ztest_random(100) < zopt_killrate)
- za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
-
- for (t = 0; t < zopt_threads; t++) {
- d = t % zopt_datasets;
-
- (void) strcpy(za[t].za_pool, pool);
- za[t].za_os = za[d].za_os;
- za[t].za_spa = spa;
- za[t].za_zilog = za[d].za_zilog;
- za[t].za_instance = t;
- za[t].za_random = ztest_random(-1ULL);
- za[t].za_start = za[0].za_start;
- za[t].za_stop = za[0].za_stop;
- za[t].za_kill = za[0].za_kill;
-
- if (t < zopt_datasets) {
- int test_future = FALSE;
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
- ztest_create_cb, NULL);
- if (error == EEXIST) {
- test_future = TRUE;
- } else if (error == ENOSPC) {
- zs->zs_enospc_count++;
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- break;
- } else if (error != 0) {
- fatal(0, "dmu_objset_create(%s) = %d",
- name, error);
- }
- error = dmu_objset_open(name, DMU_OST_OTHER,
- DS_MODE_USER, &za[d].za_os);
- if (error)
- fatal(0, "dmu_objset_open('%s') = %d",
- name, error);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- if (test_future)
- ztest_dmu_check_future_leak(&za[t]);
- zil_replay(za[d].za_os, za[d].za_os,
- ztest_replay_vector);
- za[d].za_zilog = zil_open(za[d].za_os, NULL);
- }
-
- VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
- &za[t].za_thread) == 0);
- }
-
- while (--t >= 0) {
- VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
- if (t < zopt_datasets) {
- zil_close(za[t].za_zilog);
- dmu_objset_close(za[t].za_os);
- }
+ /*
+ * Kick off all the tests that run in parallel.
+ */
+ for (int t = 0; t < zopt_threads; t++) {
+ if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+ return;
+ VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+ THR_BOUND, &tid[t]) == 0);
}
- if (zopt_verbose >= 3)
- show_pool_stats(spa);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- zs->zs_alloc = spa_get_alloc(spa);
- zs->zs_space = spa_get_space(spa);
-
/*
- * If we had out-of-space errors, destroy a random objset.
+ * Wait for all of the tests to complete. We go in reverse order
+ * so we don't close datasets while threads are still using them.
*/
- if (zs->zs_enospc_count != 0) {
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- d = (int)ztest_random(zopt_datasets);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- if (zopt_verbose >= 3)
- (void) printf("Destroying %s to free up space\n", name);
- (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
- DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ for (int t = zopt_threads - 1; t >= 0; t--) {
+ VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+ if (t < zopt_datasets)
+ ztest_dataset_close(zs, t);
}
txg_wait_synced(spa_get_dsl(spa), 0);
- umem_free(za, zopt_threads * sizeof (ztest_args_t));
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(tid, zopt_threads * sizeof (thread_t));
/* Kill the resume thread */
ztest_exiting = B_TRUE;
@@ -3808,11 +5228,107 @@ ztest_run(char *pool)
* Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete.
*/
- for (t = 1; t < 50; t++)
- dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+ for (uint64_t object = 1; object < 50; object++)
+ dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
spa_close(spa, FTAG);
+ /*
+ * Verify that we can loop over all pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+ if (zopt_verbose > 3)
+ (void) printf("spa_next: found %s\n", spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Verify that we can export the pool and reimport it under a
+ * different name.
+ */
+ if (ztest_random(2) == 0) {
+ char name[MAXNAMELEN];
+ (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+ ztest_spa_import_export(zs->zs_pool, name);
+ ztest_spa_import_export(name, zs->zs_pool);
+ }
+
+ kernel_fini();
+
+ list_destroy(&zcl.zcl_callbacks);
+
+ (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
+
+ (void) rwlock_destroy(&zs->zs_name_lock);
+ (void) _mutex_destroy(&zs->zs_vdev_lock);
+}
+
+static void
+ztest_freeze(ztest_shared_t *zs)
+{
+ ztest_ds_t *zd = &zs->zs_zd[0];
+ spa_t *spa;
+ int numloops = 0;
+
+ if (zopt_verbose >= 3)
+ (void) printf("testing spa_freeze()...\n");
+
+ kernel_init(FREAD | FWRITE);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+
+ /*
+ * Force the first log block to be transactionally allocated.
+ * We have to do this before we freeze the pool -- otherwise
+ * the log chain won't be anchored.
+ */
+ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+ ztest_dmu_object_alloc_free(zd, 0);
+ zil_commit(zd->zd_zilog, 0);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Freeze the pool. This stops spa_sync() from doing anything,
+ * so that the only way to record changes from now on is the ZIL.
+ */
+ spa_freeze(spa);
+
+ /*
+ * Run tests that generate log records but don't alter the pool config
+ * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+ * We do a txg_wait_synced() after each iteration to force the txg
+ * to increase well beyond the last synced value in the uberblock.
+ * The ZIL should be OK with that.
+ */
+ while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) {
+ ztest_dmu_write_parallel(zd, 0);
+ ztest_dmu_object_alloc_free(zd, 0);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Commit all of the changes we just generated.
+ */
+ zil_commit(zd->zd_zilog, 0);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Close our dataset and close the pool.
+ */
+ ztest_dataset_close(zs, 0);
+ spa_close(spa, FTAG);
+ kernel_fini();
+
+ /*
+ * Open and close the pool and dataset to induce log replay.
+ */
+ kernel_init(FREAD | FWRITE);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+ ztest_dataset_close(zs, 0);
+ spa_close(spa, FTAG);
kernel_fini();
}
@@ -3841,43 +5357,65 @@ print_time(hrtime_t t, char *timebuf)
(void) sprintf(timebuf, "%llus", s);
}
+static nvlist_t *
+make_random_props()
+{
+ nvlist_t *props;
+
+ if (ztest_random(2) == 0)
+ return (NULL);
+
+ VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
+
+ (void) printf("props:\n");
+ dump_nvlist(props, 4);
+
+ return (props);
+}
+
/*
* Create a storage pool with the given name and initial vdev size.
- * Then create the specified number of datasets in the pool.
+ * Then test spa_freeze() functionality.
*/
static void
-ztest_init(char *pool)
+ztest_init(ztest_shared_t *zs)
{
spa_t *spa;
- int error;
- nvlist_t *nvroot;
+ nvlist_t *nvroot, *props;
+
+ VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
kernel_init(FREAD | FWRITE);
/*
* Create the storage pool.
*/
- (void) spa_destroy(pool);
- ztest_shared->zs_vdev_primaries = 0;
+ (void) spa_destroy(zs->zs_pool);
+ ztest_shared->zs_vdev_next_leaf = 0;
+ zs->zs_splits = 0;
+ zs->zs_mirrors = zopt_mirrors;
nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
- 0, zopt_raidz, zopt_mirrors, 1);
- error = spa_create(pool, nvroot, NULL, NULL, NULL);
+ 0, zopt_raidz, zs->zs_mirrors, 1);
+ props = make_random_props();
+ VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL));
nvlist_free(nvroot);
- if (error)
- fatal(0, "spa_create() = %d", error);
- error = spa_open(pool, &spa, FTAG);
- if (error)
- fatal(0, "spa_open() = %d", error);
-
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
-
- if (zopt_verbose >= 3)
- show_pool_stats(spa);
-
spa_close(spa, FTAG);
kernel_fini();
+
+ ztest_run_zdb(zs->zs_pool);
+
+ ztest_freeze(zs);
+
+ ztest_run_zdb(zs->zs_pool);
+
+ (void) rwlock_destroy(&zs->zs_name_lock);
+ (void) _mutex_destroy(&zs->zs_vdev_lock);
}
int
@@ -3885,29 +5423,32 @@ main(int argc, char **argv)
{
int kills = 0;
int iters = 0;
- int i, f;
ztest_shared_t *zs;
+ size_t shared_size;
ztest_info_t *zi;
char timebuf[100];
char numbuf[6];
+ spa_t *spa;
(void) setvbuf(stdout, NULL, _IOLBF, 0);
- /* Override location of zpool.cache */
- spa_config_path = "/tmp/zpool.cache";
-
ztest_random_fd = open("/dev/urandom", O_RDONLY);
process_options(argc, argv);
+ /* Override location of zpool.cache */
+ (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", zopt_dir);
+
/*
* Blow away any existing copy of zpool.cache
*/
if (zopt_init != 0)
- (void) remove("/tmp/zpool.cache");
+ (void) remove(spa_config_path);
+
+ shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);
zs = ztest_shared = (void *)mmap(0,
- P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+ P2ROUNDUP(shared_size, getpagesize()),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
if (zopt_verbose >= 1) {
@@ -3920,46 +5461,43 @@ main(int argc, char **argv)
/*
* Create and initialize our storage pool.
*/
- for (i = 1; i <= zopt_init; i++) {
+ for (int i = 1; i <= zopt_init; i++) {
bzero(zs, sizeof (ztest_shared_t));
if (zopt_verbose >= 3 && zopt_init != 1)
(void) printf("ztest_init(), pass %d\n", i);
- ztest_init(zopt_pool);
+ zs->zs_pool = zopt_pool;
+ ztest_init(zs);
}
- /*
- * Initialize the call targets for each function.
- */
- for (f = 0; f < ZTEST_FUNCS; f++) {
- zi = &zs->zs_info[f];
+ zs->zs_pool = zopt_pool;
+ zs->zs_proc_start = gethrtime();
+ zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
+ zi = &zs->zs_info[f];
*zi = ztest_info[f];
-
- if (*zi->zi_interval == 0)
- zi->zi_call_target = UINT64_MAX;
+ if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+ zi->zi_call_next = UINT64_MAX;
else
- zi->zi_call_target = zopt_time / *zi->zi_interval;
+ zi->zi_call_next = zs->zs_proc_start +
+ ztest_random(2 * zi->zi_interval[0] + 1);
}
- zs->zs_start_time = gethrtime();
- zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
-
/*
* Run the tests in a loop. These tests include fault injection
* to verify that self-healing data works, and forced crashes
* to verify that we never lose on-disk consistency.
*/
- while (gethrtime() < zs->zs_stop_time) {
+ while (gethrtime() < zs->zs_proc_stop) {
int status;
pid_t pid;
- char *tmp;
/*
* Initialize the workload counters for each function.
*/
- for (f = 0; f < ZTEST_FUNCS; f++) {
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
zi = &zs->zs_info[f];
- zi->zi_calls = 0;
+ zi->zi_call_count = 0;
zi->zi_call_time = 0;
}
@@ -3975,7 +5513,7 @@ main(int argc, char **argv)
struct rlimit rl = { 1024, 1024 };
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
- ztest_run(zopt_pool);
+ ztest_run(zs);
exit(0);
}
@@ -4008,8 +5546,8 @@ main(int argc, char **argv)
if (zopt_verbose >= 1) {
hrtime_t now = gethrtime();
- now = MIN(now, zs->zs_stop_time);
- print_time(zs->zs_stop_time - now, timebuf);
+ now = MIN(now, zs->zs_proc_stop);
+ print_time(zs->zs_proc_stop - now, timebuf);
nicenum(zs->zs_space, numbuf);
(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
@@ -4019,7 +5557,7 @@ main(int argc, char **argv)
(u_longlong_t)zs->zs_enospc_count,
100.0 * zs->zs_alloc / zs->zs_space,
numbuf,
- 100.0 * (now - zs->zs_start_time) /
+ 100.0 * (now - zs->zs_proc_start) /
(zopt_time * NANOSEC), timebuf);
}
@@ -4029,34 +5567,39 @@ main(int argc, char **argv)
"Calls", "Time", "Function");
(void) printf("%7s %9s %s\n",
"-----", "----", "--------");
- for (f = 0; f < ZTEST_FUNCS; f++) {
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
Dl_info dli;
zi = &zs->zs_info[f];
print_time(zi->zi_call_time, timebuf);
(void) dladdr((void *)zi->zi_func, &dli);
(void) printf("%7llu %9s %s\n",
- (u_longlong_t)zi->zi_calls, timebuf,
+ (u_longlong_t)zi->zi_call_count, timebuf,
dli.dli_sname);
}
(void) printf("\n");
}
/*
- * It's possible that we killed a child during a rename test, in
- * which case we'll have a 'ztest_tmp' pool lying around instead
- * of 'ztest'. Do a blind rename in case this happened.
+ * It's possible that we killed a child during a rename test,
+ * in which case we'll have a 'ztest_tmp' pool lying around
+ * instead of 'ztest'. Do a blind rename in case this happened.
*/
- tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
- (void) strcpy(tmp, zopt_pool);
- (void) strcat(tmp, "_tmp");
- kernel_init(FREAD | FWRITE);
- (void) spa_rename(tmp, zopt_pool);
+ kernel_init(FREAD);
+ if (spa_open(zopt_pool, &spa, FTAG) == 0) {
+ spa_close(spa, FTAG);
+ } else {
+ char tmpname[MAXNAMELEN];
+ kernel_fini();
+ kernel_init(FREAD | FWRITE);
+ (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
+ zopt_pool);
+ (void) spa_rename(tmpname, zopt_pool);
+ }
kernel_fini();
- umem_free(tmp, strlen(tmp) + 1);
- }
- ztest_verify_blocks(zopt_pool);
+ ztest_run_zdb(zopt_pool);
+ }
if (zopt_verbose >= 1) {
(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
diff --git a/cddl/contrib/opensolaris/head/synch.h b/cddl/contrib/opensolaris/head/synch.h
index eab9de86a534..89efe9c687ac 100644
--- a/cddl/contrib/opensolaris/head/synch.h
+++ b/cddl/contrib/opensolaris/head/synch.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYNCH_H
#define _SYNCH_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* synch.h:
* definitions needed to use the thread synchronization interface
@@ -243,10 +240,17 @@ int sema_trywait();
#ifdef __STDC__
-int _sema_held(sema_t *);
-int _rw_read_held(rwlock_t *);
-int _rw_write_held(rwlock_t *);
-int _mutex_held(mutex_t *);
+/*
+ * The *_held() functions apply equally well to Solaris threads
+ * and to Posix threads synchronization objects, but the formal
+ * type declarations are different, so we just declare the argument
+ * to each *_held() function to be a void *, expecting that they will
+ * be called with the proper type of argument in each case.
+ */
+int _sema_held(void *); /* sema_t or sem_t */
+int _rw_read_held(void *); /* rwlock_t or pthread_rwlock_t */
+int _rw_write_held(void *); /* rwlock_t or pthread_rwlock_t */
+int _mutex_held(void *); /* mutex_t or pthread_mutex_t */
#else /* __STDC__ */
@@ -257,6 +261,13 @@ int _mutex_held();
#endif /* __STDC__ */
+/* Pause API */
+#ifdef __STDC__
+void smt_pause(void);
+#else /* __STDC__ */
+void smt_pause();
+#endif /* __STDC__ */
+
#endif /* _ASM */
#ifdef __cplusplus
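
As the block comment above notes, the void * signatures let one set of _held()
assertions cover both Solaris and POSIX synchronization objects. A minimal
sketch of the intended use; the lock name and caller are hypothetical:

    #include <pthread.h>
    #include <synch.h>
    #include <assert.h>

    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    cache_update_locked(void)
    {
            /* Same call would work for a mutex_t; the argument is void *. */
            assert(_mutex_held(&cache_lock));
            /* ... mutate the cache ... */
    }
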
diff --git a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
index 89e01dd894f6..14257487361d 100644
--- a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
+++ b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c
@@ -19,15 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <solaris.h>
#include <inttypes.h>
#include <unistd.h>
#include <strings.h>
+#include <libintl.h>
+#include <stdarg.h>
#include "libnvpair.h"
/*
@@ -38,21 +38,531 @@
* between kernel and userland, and possibly saving onto disk files.
*/
+/*
+ * Print control structure.
+ */
+
+#define DEFINEOP(opname, vtype) \
+ struct { \
+ int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+ const char *, vtype); \
+ void *arg; \
+ } opname
+
+#define DEFINEARROP(opname, vtype) \
+ struct { \
+ int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+ const char *, vtype, uint_t); \
+ void *arg; \
+ } opname
+
+struct nvlist_printops {
+ DEFINEOP(print_boolean, int);
+ DEFINEOP(print_boolean_value, boolean_t);
+ DEFINEOP(print_byte, uchar_t);
+ DEFINEOP(print_int8, int8_t);
+ DEFINEOP(print_uint8, uint8_t);
+ DEFINEOP(print_int16, int16_t);
+ DEFINEOP(print_uint16, uint16_t);
+ DEFINEOP(print_int32, int32_t);
+ DEFINEOP(print_uint32, uint32_t);
+ DEFINEOP(print_int64, int64_t);
+ DEFINEOP(print_uint64, uint64_t);
+ DEFINEOP(print_double, double);
+ DEFINEOP(print_string, char *);
+ DEFINEOP(print_hrtime, hrtime_t);
+ DEFINEOP(print_nvlist, nvlist_t *);
+ DEFINEARROP(print_boolean_array, boolean_t *);
+ DEFINEARROP(print_byte_array, uchar_t *);
+ DEFINEARROP(print_int8_array, int8_t *);
+ DEFINEARROP(print_uint8_array, uint8_t *);
+ DEFINEARROP(print_int16_array, int16_t *);
+ DEFINEARROP(print_uint16_array, uint16_t *);
+ DEFINEARROP(print_int32_array, int32_t *);
+ DEFINEARROP(print_uint32_array, uint32_t *);
+ DEFINEARROP(print_int64_array, int64_t *);
+ DEFINEARROP(print_uint64_array, uint64_t *);
+ DEFINEARROP(print_string_array, char **);
+ DEFINEARROP(print_nvlist_array, nvlist_t **);
+};
+
+struct nvlist_prtctl {
+ FILE *nvprt_fp; /* output destination */
+ enum nvlist_indent_mode nvprt_indent_mode; /* see above */
+ int nvprt_indent; /* absolute indent, or tab depth */
+ int nvprt_indentinc; /* indent or tab increment */
+ const char *nvprt_nmfmt; /* member name format, max one %s */
+ const char *nvprt_eomfmt; /* after member format, e.g. "\n" */
+ const char *nvprt_btwnarrfmt; /* between array members */
+ int nvprt_btwnarrfmt_nl; /* nvprt_btwnarrfmt includes newline? */
+ struct nvlist_printops *nvprt_dfltops;
+ struct nvlist_printops *nvprt_custops;
+};
+
+#define DFLTPRTOP(pctl, type) \
+ ((pctl)->nvprt_dfltops->print_##type.op)
+
+#define DFLTPRTOPARG(pctl, type) \
+ ((pctl)->nvprt_dfltops->print_##type.arg)
+
+#define CUSTPRTOP(pctl, type) \
+ ((pctl)->nvprt_custops->print_##type.op)
+
+#define CUSTPRTOPARG(pctl, type) \
+ ((pctl)->nvprt_custops->print_##type.arg)
+
+#define RENDER(pctl, type, nvl, name, val) \
+ { \
+ int done = 0; \
+ if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+ done = CUSTPRTOP(pctl, type)(pctl, \
+ CUSTPRTOPARG(pctl, type), nvl, name, val); \
+ } \
+ if (!done) { \
+ (void) DFLTPRTOP(pctl, type)(pctl, \
+ DFLTPRTOPARG(pctl, type), nvl, name, val); \
+ } \
+ (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
+ }
+
+#define ARENDER(pctl, type, nvl, name, arrp, count) \
+ { \
+ int done = 0; \
+ if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+ done = CUSTPRTOP(pctl, type)(pctl, \
+ CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+ } \
+ if (!done) { \
+ (void) DFLTPRTOP(pctl, type)(pctl, \
+ DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+ } \
+ (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
+ }
+
+static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t);
+
+/*
+ * ======================================================================
+ * | |
+ * | Indentation |
+ * | |
+ * ======================================================================
+ */
+
static void
-indent(FILE *fp, int depth)
+indent(nvlist_prtctl_t pctl, int onemore)
{
- while (depth-- > 0)
- (void) fprintf(fp, "\t");
+ int depth;
+
+ switch (pctl->nvprt_indent_mode) {
+ case NVLIST_INDENT_ABS:
+ (void) fprintf(pctl->nvprt_fp, "%*s",
+ pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, "");
+ break;
+
+ case NVLIST_INDENT_TABBED:
+ depth = pctl->nvprt_indent + onemore;
+ while (depth-- > 0)
+ (void) fprintf(pctl->nvprt_fp, "\t");
+ }
}
/*
- * nvlist_print - Prints elements in an event buffer
+ * ======================================================================
+ * | |
+ * | Default nvlist member rendering functions. |
+ * | |
+ * ======================================================================
+ */
+
+/*
+ * Generate functions to print single-valued nvlist members.
+ *
+ * type_and_variant - suffix to form function name
+ * vtype - C type for the member value
+ * ptype - C type to cast value to for printing
+ * vfmt - format string for pair value, e.g "%d" or "0x%llx"
+ */
+
+#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+ nvlist_t *nvl, const char *name, vtype value) \
+{ \
+ FILE *fp = pctl->nvprt_fp; \
+ NOTE(ARGUNUSED(private)) \
+ NOTE(ARGUNUSED(nvl)) \
+ indent(pctl, 1); \
+ (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+ (void) fprintf(fp, vfmt, (ptype)value); \
+ return (1); \
+}
+
+NVLIST_PRTFUNC(boolean, int, int, "%d")
+NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d")
+NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_PRTFUNC(int8, int8_t, int, "%d")
+NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x")
+NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d")
+NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x")
+NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d")
+NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x")
+NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld")
+NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx")
+NVLIST_PRTFUNC(double, double, double, "0x%llf")
+NVLIST_PRTFUNC(string, char *, char *, "%s")
+NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx")
+
+/*
+ * Generate functions to print array-valued nvlist members.
+ */
+
+#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+ nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \
+{ \
+ FILE *fp = pctl->nvprt_fp; \
+ uint_t i; \
+ NOTE(ARGUNUSED(private)) \
+ NOTE(ARGUNUSED(nvl)) \
+ for (i = 0; i < count; i++) { \
+ if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \
+ indent(pctl, 1); \
+ (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+ if (pctl->nvprt_btwnarrfmt_nl) \
+ (void) fprintf(fp, "[%d]: ", i); \
+ } \
+ if (i != 0) \
+ (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \
+ (void) fprintf(fp, vfmt, (ptype)valuep[i]); \
+ } \
+ return (1); \
+}
+
+NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d")
+NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d")
+NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x")
+NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d")
+NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x")
+NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d")
+NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x")
+NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld")
+NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx")
+NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s")
+
+/*ARGSUSED*/
+static int
+nvprint_nvlist(nvlist_prtctl_t pctl, void *private,
+ nvlist_t *nvl, const char *name, nvlist_t *value)
+{
+ FILE *fp = pctl->nvprt_fp;
+
+ indent(pctl, 1);
+ (void) fprintf(fp, "%s = (embedded nvlist)\n", name);
+
+ pctl->nvprt_indent += pctl->nvprt_indentinc;
+ nvlist_print_with_indent(value, pctl);
+ pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+ indent(pctl, 1);
+ (void) fprintf(fp, "(end %s)\n", name);
+
+ return (1);
+}
+
+/*ARGSUSED*/
+static int
+nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private,
+ nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count)
+{
+ FILE *fp = pctl->nvprt_fp;
+ uint_t i;
+
+ indent(pctl, 1);
+ (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name);
+
+ for (i = 0; i < count; i++) {
+ indent(pctl, 1);
+ (void) fprintf(fp, "(start %s[%d])\n", name, i);
+
+ pctl->nvprt_indent += pctl->nvprt_indentinc;
+ nvlist_print_with_indent(valuep[i], pctl);
+ pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+ indent(pctl, 1);
+ (void) fprintf(fp, "(end %s[%d])\n", name, i);
+ }
+
+ return (1);
+}
+
+/*
+ * ======================================================================
+ * | |
+ * | Interfaces that allow control over formatting. |
+ * | |
+ * ======================================================================
+ */
+
+void
+nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp)
+{
+ pctl->nvprt_fp = fp;
+}
+
+FILE *
+nvlist_prtctl_getdest(nvlist_prtctl_t pctl)
+{
+ return (pctl->nvprt_fp);
+}
+
+
+void
+nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode,
+ int start, int inc)
+{
+ if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED)
+ mode = NVLIST_INDENT_TABBED;
+
+ if (start < 0)
+ start = 0;
+
+ if (inc < 0)
+ inc = 1;
+
+ pctl->nvprt_indent_mode = mode;
+ pctl->nvprt_indent = start;
+ pctl->nvprt_indentinc = inc;
+}
+
+void
+nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore)
+{
+ indent(pctl, onemore);
+}
+
+
+void
+nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which,
+ const char *fmt)
+{
+ switch (which) {
+ case NVLIST_FMT_MEMBER_NAME:
+ if (fmt == NULL)
+ fmt = "%s = ";
+ pctl->nvprt_nmfmt = fmt;
+ break;
+
+ case NVLIST_FMT_MEMBER_POSTAMBLE:
+ if (fmt == NULL)
+ fmt = "\n";
+ pctl->nvprt_eomfmt = fmt;
+ break;
+
+ case NVLIST_FMT_BTWN_ARRAY:
+ if (fmt == NULL) {
+ pctl->nvprt_btwnarrfmt = " ";
+ pctl->nvprt_btwnarrfmt_nl = 0;
+ } else {
+ pctl->nvprt_btwnarrfmt = fmt;
+ pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL);
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+
+void
+nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...)
+{
+ FILE *fp = pctl->nvprt_fp;
+ va_list ap;
+ char *name;
+
+ va_start(ap, which);
+
+ switch (which) {
+ case NVLIST_FMT_MEMBER_NAME:
+ name = va_arg(ap, char *);
+ (void) fprintf(fp, pctl->nvprt_nmfmt, name);
+ break;
+
+ case NVLIST_FMT_MEMBER_POSTAMBLE:
+ (void) fprintf(fp, pctl->nvprt_eomfmt);
+ break;
+
+ case NVLIST_FMT_BTWN_ARRAY:
+ (void) fprintf(fp, pctl->nvprt_btwnarrfmt);
+ break;
+
+ default:
+ break;
+ }
+
+ va_end(ap);
+}
+
+/*
+ * ======================================================================
+ * | |
+ * | Interfaces to allow appointment of replacement rendering functions.|
+ * | |
+ * ======================================================================
+ */
+
+#define NVLIST_PRINTCTL_REPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+ int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \
+ void *private) \
+{ \
+ CUSTPRTOP(pctl, type) = func; \
+ CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_REPLACE(boolean, int)
+NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t)
+NVLIST_PRINTCTL_REPLACE(byte, uchar_t)
+NVLIST_PRINTCTL_REPLACE(int8, int8_t)
+NVLIST_PRINTCTL_REPLACE(uint8, uint8_t)
+NVLIST_PRINTCTL_REPLACE(int16, int16_t)
+NVLIST_PRINTCTL_REPLACE(uint16, uint16_t)
+NVLIST_PRINTCTL_REPLACE(int32, int32_t)
+NVLIST_PRINTCTL_REPLACE(uint32, uint32_t)
+NVLIST_PRINTCTL_REPLACE(int64, int64_t)
+NVLIST_PRINTCTL_REPLACE(uint64, uint64_t)
+NVLIST_PRINTCTL_REPLACE(double, double)
+NVLIST_PRINTCTL_REPLACE(string, char *)
+NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t)
+NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *)
+
+#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+ int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \
+ uint_t), void *private) \
+{ \
+ CUSTPRTOP(pctl, type) = func; \
+ CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *)
+NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *)
+NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *)
+NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *)
+NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *)
+NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *)
+NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *)
+NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *)
+NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *)
+NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *)
+NVLIST_PRINTCTL_AREPLACE(string_array, char **)
+NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **)
+
+/*
+ * ======================================================================
+ * | |
+ * | Interfaces to manage nvlist_prtctl_t cookies. |
+ * | |
+ * ======================================================================
*/
-static
+
+
+static const struct nvlist_printops defprtops = {
+ { nvprint_boolean, NULL },
+ { nvprint_boolean_value, NULL },
+ { nvprint_byte, NULL },
+ { nvprint_int8, NULL },
+ { nvprint_uint8, NULL },
+ { nvprint_int16, NULL },
+ { nvprint_uint16, NULL },
+ { nvprint_int32, NULL },
+ { nvprint_uint32, NULL },
+ { nvprint_int64, NULL },
+ { nvprint_uint64, NULL },
+ { nvprint_double, NULL },
+ { nvprint_string, NULL },
+ { nvprint_hrtime, NULL },
+ { nvprint_nvlist, NULL },
+ { nvaprint_boolean_array, NULL },
+ { nvaprint_byte_array, NULL },
+ { nvaprint_int8_array, NULL },
+ { nvaprint_uint8_array, NULL },
+ { nvaprint_int16_array, NULL },
+ { nvaprint_uint16_array, NULL },
+ { nvaprint_int32_array, NULL },
+ { nvaprint_uint32_array, NULL },
+ { nvaprint_int64_array, NULL },
+ { nvaprint_uint64_array, NULL },
+ { nvaprint_string_array, NULL },
+ { nvaprint_nvlist_array, NULL },
+};
+
+static void
+prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl,
+ struct nvlist_printops *ops)
+{
+ pctl->nvprt_fp = fp;
+ pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED;
+ pctl->nvprt_indent = 0;
+ pctl->nvprt_indentinc = 1;
+ pctl->nvprt_nmfmt = "%s = ";
+ pctl->nvprt_eomfmt = "\n";
+ pctl->nvprt_btwnarrfmt = " ";
+ pctl->nvprt_btwnarrfmt_nl = 0;
+
+ pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops;
+ pctl->nvprt_custops = ops;
+}
+
+nvlist_prtctl_t
+nvlist_prtctl_alloc(void)
+{
+ struct nvlist_prtctl *pctl;
+ struct nvlist_printops *ops;
+
+ if ((pctl = malloc(sizeof (*pctl))) == NULL)
+ return (NULL);
+
+ if ((ops = calloc(1, sizeof (*ops))) == NULL) {
+ free(pctl);
+ return (NULL);
+ }
+
+ prtctl_defaults(stdout, pctl, ops);
+
+ return (pctl);
+}
+
void
-nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
+nvlist_prtctl_free(nvlist_prtctl_t pctl)
+{
+ if (pctl != NULL) {
+ free(pctl->nvprt_custops);
+ free(pctl);
+ }
+}
+
+/*
+ * ======================================================================
+ * | |
+ * | Top-level print request interfaces. |
+ * | |
+ * ======================================================================
+ */
+
+/*
+ * nvlist_print - Prints elements in an event buffer
+ */
+static void
+nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl)
{
- int i;
+ FILE *fp = pctl->nvprt_fp;
char *name;
uint_t nelem;
nvpair_t *nvp;
@@ -60,7 +570,7 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
if (nvl == NULL)
return;
- indent(fp, depth);
+ indent(pctl, 0);
(void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl));
nvp = nvlist_next_nvpair(nvl, NULL);
@@ -68,199 +578,174 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
while (nvp) {
data_type_t type = nvpair_type(nvp);
- indent(fp, depth);
name = nvpair_name(nvp);
- (void) fprintf(fp, "\t%s =", name);
nelem = 0;
+
switch (type) {
case DATA_TYPE_BOOLEAN: {
- (void) fprintf(fp, " 1");
+ RENDER(pctl, boolean, nvl, name, 1);
break;
}
case DATA_TYPE_BOOLEAN_VALUE: {
boolean_t val;
(void) nvpair_value_boolean_value(nvp, &val);
- (void) fprintf(fp, " %d", val);
+ RENDER(pctl, boolean_value, nvl, name, val);
break;
}
case DATA_TYPE_BYTE: {
uchar_t val;
(void) nvpair_value_byte(nvp, &val);
- (void) fprintf(fp, " 0x%2.2x", val);
+ RENDER(pctl, byte, nvl, name, val);
break;
}
case DATA_TYPE_INT8: {
int8_t val;
(void) nvpair_value_int8(nvp, &val);
- (void) fprintf(fp, " %d", val);
+ RENDER(pctl, int8, nvl, name, val);
break;
}
case DATA_TYPE_UINT8: {
uint8_t val;
(void) nvpair_value_uint8(nvp, &val);
- (void) fprintf(fp, " 0x%x", val);
+ RENDER(pctl, uint8, nvl, name, val);
break;
}
case DATA_TYPE_INT16: {
int16_t val;
(void) nvpair_value_int16(nvp, &val);
- (void) fprintf(fp, " %d", val);
+ RENDER(pctl, int16, nvl, name, val);
break;
}
case DATA_TYPE_UINT16: {
uint16_t val;
(void) nvpair_value_uint16(nvp, &val);
- (void) fprintf(fp, " 0x%x", val);
+ RENDER(pctl, uint16, nvl, name, val);
break;
}
case DATA_TYPE_INT32: {
int32_t val;
(void) nvpair_value_int32(nvp, &val);
- (void) fprintf(fp, " %d", val);
+ RENDER(pctl, int32, nvl, name, val);
break;
}
case DATA_TYPE_UINT32: {
uint32_t val;
(void) nvpair_value_uint32(nvp, &val);
- (void) fprintf(fp, " 0x%x", val);
+ RENDER(pctl, uint32, nvl, name, val);
break;
}
case DATA_TYPE_INT64: {
int64_t val;
(void) nvpair_value_int64(nvp, &val);
- (void) fprintf(fp, " %lld", (longlong_t)val);
+ RENDER(pctl, int64, nvl, name, val);
break;
}
case DATA_TYPE_UINT64: {
uint64_t val;
(void) nvpair_value_uint64(nvp, &val);
- (void) fprintf(fp, " 0x%llx", (u_longlong_t)val);
+ RENDER(pctl, uint64, nvl, name, val);
break;
}
case DATA_TYPE_DOUBLE: {
double val;
(void) nvpair_value_double(nvp, &val);
- (void) fprintf(fp, " 0x%llf", val);
+ RENDER(pctl, double, nvl, name, val);
break;
}
case DATA_TYPE_STRING: {
char *val;
(void) nvpair_value_string(nvp, &val);
- (void) fprintf(fp, " %s", val);
+ RENDER(pctl, string, nvl, name, val);
break;
}
case DATA_TYPE_BOOLEAN_ARRAY: {
boolean_t *val;
(void) nvpair_value_boolean_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %d", val[i]);
+ ARENDER(pctl, boolean_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_BYTE_ARRAY: {
uchar_t *val;
(void) nvpair_value_byte_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " 0x%2.2x", val[i]);
+ ARENDER(pctl, byte_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_INT8_ARRAY: {
int8_t *val;
(void) nvpair_value_int8_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %d", val[i]);
+ ARENDER(pctl, int8_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_UINT8_ARRAY: {
uint8_t *val;
(void) nvpair_value_uint8_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " 0x%x", val[i]);
+ ARENDER(pctl, uint8_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_INT16_ARRAY: {
int16_t *val;
(void) nvpair_value_int16_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %d", val[i]);
+ ARENDER(pctl, int16_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_UINT16_ARRAY: {
uint16_t *val;
(void) nvpair_value_uint16_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " 0x%x", val[i]);
+ ARENDER(pctl, uint16_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_INT32_ARRAY: {
int32_t *val;
(void) nvpair_value_int32_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %d", val[i]);
+ ARENDER(pctl, int32_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_UINT32_ARRAY: {
uint32_t *val;
(void) nvpair_value_uint32_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " 0x%x", val[i]);
+ ARENDER(pctl, uint32_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_INT64_ARRAY: {
int64_t *val;
(void) nvpair_value_int64_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %lld", (longlong_t)val[i]);
+ ARENDER(pctl, int64_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_UINT64_ARRAY: {
uint64_t *val;
(void) nvpair_value_uint64_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " 0x%llx",
- (u_longlong_t)val[i]);
+ ARENDER(pctl, uint64_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_STRING_ARRAY: {
char **val;
(void) nvpair_value_string_array(nvp, &val, &nelem);
- for (i = 0; i < nelem; i++)
- (void) fprintf(fp, " %s", val[i]);
+ ARENDER(pctl, string_array, nvl, name, val, nelem);
break;
}
case DATA_TYPE_HRTIME: {
hrtime_t val;
(void) nvpair_value_hrtime(nvp, &val);
- (void) fprintf(fp, " 0x%llx", val);
+ RENDER(pctl, hrtime, nvl, name, val);
break;
}
case DATA_TYPE_NVLIST: {
nvlist_t *val;
(void) nvpair_value_nvlist(nvp, &val);
- (void) fprintf(fp, " (embedded nvlist)\n");
- nvlist_print_with_indent(fp, val, depth + 1);
- indent(fp, depth + 1);
- (void) fprintf(fp, "(end %s)\n", name);
+ RENDER(pctl, nvlist, nvl, name, val);
break;
}
case DATA_TYPE_NVLIST_ARRAY: {
nvlist_t **val;
(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
- (void) fprintf(fp, " (array of embedded nvlists)\n");
- for (i = 0; i < nelem; i++) {
- indent(fp, depth + 1);
- (void) fprintf(fp,
- "(start %s[%d])\n", name, i);
- nvlist_print_with_indent(fp, val[i], depth + 1);
- indent(fp, depth + 1);
- (void) fprintf(fp, "(end %s[%d])\n", name, i);
- }
+ ARENDER(pctl, nvlist_array, nvl, name, val, nelem);
break;
}
default:
(void) fprintf(fp, " unknown data type (%d)", type);
break;
}
- (void) fprintf(fp, "\n");
nvp = nvlist_next_nvpair(nvl, nvp);
}
}
@@ -268,9 +753,175 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
void
nvlist_print(FILE *fp, nvlist_t *nvl)
{
- nvlist_print_with_indent(fp, nvl, 0);
+ struct nvlist_prtctl pc;
+
+ prtctl_defaults(fp, &pc, NULL);
+ nvlist_print_with_indent(nvl, &pc);
+}
+
+void
+nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl)
+{
+ nvlist_print_with_indent(nvl, pctl);
+}
+
+#define NVP(elem, type, vtype, ptype, format) { \
+ vtype value; \
+\
+ (void) nvpair_value_##type(elem, &value); \
+ (void) printf("%*s%s: " format "\n", indent, "", \
+ nvpair_name(elem), (ptype)value); \
}
+#define NVPA(elem, type, vtype, ptype, format) { \
+ uint_t i, count; \
+ vtype *value; \
+\
+ (void) nvpair_value_##type(elem, &value, &count); \
+ for (i = 0; i < count; i++) { \
+ (void) printf("%*s%s[%d]: " format "\n", indent, "", \
+ nvpair_name(elem), i, (ptype)value[i]); \
+ } \
+}
+
+/*
+ * Similar to nvlist_print() but handles arrays slightly differently.
+ */
+void
+dump_nvlist(nvlist_t *list, int indent)
+{
+ nvpair_t *elem = NULL;
+ boolean_t bool_value;
+ nvlist_t *nvlist_value;
+ nvlist_t **nvlist_array_value;
+ uint_t i, count;
+
+ if (list == NULL) {
+ return;
+ }
+
+ while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(elem, &bool_value);
+ (void) printf("%*s%s: %s\n", indent, "",
+ nvpair_name(elem), bool_value ? "true" : "false");
+ break;
+
+ case DATA_TYPE_BYTE:
+ NVP(elem, byte, uchar_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT8:
+ NVP(elem, int8, int8_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT8:
+ NVP(elem, uint8, uint8_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT16:
+ NVP(elem, int16, int16_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT16:
+ NVP(elem, uint16, uint16_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT32:
+ NVP(elem, int32, int32_t, long, "%ld");
+ break;
+
+ case DATA_TYPE_UINT32:
+ NVP(elem, uint32, uint32_t, ulong_t, "%lu");
+ break;
+
+ case DATA_TYPE_INT64:
+ NVP(elem, int64, int64_t, longlong_t, "%lld");
+ break;
+
+ case DATA_TYPE_UINT64:
+ NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
+ break;
+
+ case DATA_TYPE_STRING:
+ NVP(elem, string, char *, char *, "'%s'");
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ NVPA(elem, byte_array, uchar_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ NVPA(elem, int8_array, int8_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT8_ARRAY:
+ NVPA(elem, uint8_array, uint8_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ NVPA(elem, int16_array, int16_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ NVPA(elem, uint16_array, uint16_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT32_ARRAY:
+ NVPA(elem, int32_array, int32_t, long, "%ld");
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ NVPA(elem, uint64_array, uint64_t, u_longlong_t,
+ "%llu");
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ NVPA(elem, string_array, char *, char *, "'%s'");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ (void) nvpair_value_nvlist(elem, &nvlist_value);
+ (void) printf("%*s%s:\n", indent, "",
+ nvpair_name(elem));
+ dump_nvlist(nvlist_value, indent + 4);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ (void) nvpair_value_nvlist_array(elem,
+ &nvlist_array_value, &count);
+ for (i = 0; i < count; i++) {
+ (void) printf("%*s%s[%u]:\n", indent, "",
+ nvpair_name(elem), i);
+ dump_nvlist(nvlist_array_value[i], indent + 4);
+ }
+ break;
+
+ default:
+ (void) printf(dgettext(TEXT_DOMAIN, "bad config type "
+ "%d for %s\n"), nvpair_type(elem),
+ nvpair_name(elem));
+ }
+ }
+}
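
A sketch of dump_nvlist() output, reusing the autoreplace property that
make_random_props() above actually sets plus a hypothetical string pair
(error checking elided):

    nvlist_t *props;

    (void) nvlist_alloc(&props, NV_UNIQUE_NAME, 0);
    (void) nvlist_add_uint64(props, "autoreplace", 1);
    (void) nvlist_add_string(props, "comment", "test pool");
    dump_nvlist(props, 4);
    nvlist_free(props);

would print, per the NVP macros above (four spaces of indent per level,
strings quoted):

        autoreplace: 1
        comment: 'test pool'
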
+
+/*
+ * ======================================================================
+ * | |
+ * | Misc private interface. |
+ * | |
+ * ======================================================================
+ */
+
/*
* Determine if string 'value' matches 'nvp' value. The 'value' string is
* converted, depending on the type of 'nvp', prior to match. For numeric
diff --git a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
index e655e0d4069d..4c2615d924a5 100644
--- a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
+++ b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _LIBNVPAIR_H
#define _LIBNVPAIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#include <stdlib.h>
#include <stdio.h>
@@ -37,9 +34,158 @@
extern "C" {
#endif
-void nvlist_print(FILE *, nvlist_t *);
-int nvpair_value_match(nvpair_t *, int, char *, char **);
-int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
+/*
+ * All interfaces described in this file are private to Solaris, and
+ * are subject to change at any time and without notice. The public
+ * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR,
+ * are all imported from <sys/nvpair.h> included above.
+ */
+
+extern int nvpair_value_match(nvpair_t *, int, char *, char **);
+extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *,
+ char **);
+
+extern void nvlist_print(FILE *, nvlist_t *);
+extern void dump_nvlist(nvlist_t *, int);
+
+/*
+ * Private nvlist printing interface that allows the caller some control
+ * over output rendering (as opposed to nvlist_print and dump_nvlist).
+ *
+ * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc
+ * (NULL on failure); on return the cookie is set up for default formatting
+ * and rendering. Quote the cookie in subsequent customisation functions and
+ * then pass the cookie to nvlist_prt to render the nvlist. Finally,
+ * use nvlist_prtctl_free to release the cookie.
+ *
+ * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions
+ * we have a corresponding brace of functions that appoint replacement
+ * rendering functions:
+ *
+ * extern void nvlist_prtctl_xxx(nvlist_prtctl_t,
+ * void (*)(nvlist_prtctl_t ctl, void *private, const char *name,
+ * xxxtype value))
+ *
+ * and
+ *
+ * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t,
+ * void (*)(nvlist_prtctl_t ctl, void *private, const char *name,
+ * xxxtype value, uint_t count))
+ *
+ * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8"
+ * and char * for "string". The function that is appointed to render the
+ * specified datatype receives as arguments the cookie, the nvlist
+ * member name, the value of that member (or a pointer for array function),
+ * and (for array rendering functions) a count of the number of elements.
+ */
+
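+ * A minimal sketch of that workflow, using only the interfaces declared
+ * below (the nvlist nvl is assumed to exist; error handling elided):
+ *
+ *	nvlist_prtctl_t pctl;
+ *
+ *	if ((pctl = nvlist_prtctl_alloc()) == NULL)
+ *		return;
+ *
+ *	(* Two-space absolute indents, ", " between array members. *)
+ *	nvlist_prtctl_setindent(pctl, NVLIST_INDENT_ABS, 0, 2);
+ *	nvlist_prtctl_setfmt(pctl, NVLIST_FMT_BTWN_ARRAY, ", ");
+ *	nvlist_prtctl_setdest(pctl, stderr);
+ *
+ *	nvlist_prt(nvl, pctl);
+ *	nvlist_prtctl_free(pctl);
+ *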
+typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */
+
+enum nvlist_indent_mode {
+ NVLIST_INDENT_ABS, /* Absolute indentation */
+ NVLIST_INDENT_TABBED /* Indent with tabstops */
+};
+
+extern nvlist_prtctl_t nvlist_prtctl_alloc(void);
+extern void nvlist_prtctl_free(nvlist_prtctl_t);
+extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t);
+
+/* Output stream */
+extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *);
+extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t);
+
+/* Indentation mode, start indent, indent increment; default tabbed/0/1 */
+extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode,
+ int, int);
+extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int);
+
+enum nvlist_prtctl_fmt {
+ NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */
+ NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */
+ NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */
+};
+
+extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt,
+ const char *);
+extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...);
+
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for single-valued nvlist members.
+ *
+ * A replacement function receives arguments as follows:
+ *
+ * nvlist_prtctl_t Print control structure; do not change preferences
+ * for this object from a print callback function.
+ *
+ * void * The function-private cookie argument registered
+ * when the replacement function was appointed.
+ *
+ * nvlist_t * The full nvlist that is being processed. The
+ * rendering function is called to render a single
+ * member (name and value passed as below) but it may
+ * want to reference or incorporate other aspects of
+ * the full nvlist.
+ *
+ * const char * Member name to render
+ *
+ * valtype Value of the member to render
+ *
+ * The function must return non-zero if it has rendered output for this
+ * member, or 0 if it wants to default to standard rendering for this
+ * one member.
+ */
+
+#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \
+ extern void funcname(nvlist_prtctl_t, \
+ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \
+ void *)
+
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *);
+
+#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */
+
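+/*
+ * A sketch of a conforming replacement renderer: it handles one member name
+ * and defers the rest by returning 0. The "guid" special-casing below is
+ * illustrative, not part of the interface:
+ *
+ *	static int
+ *	print_guid_decimal(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl,
+ *	    const char *name, uint64_t value)
+ *	{
+ *		if (strcmp(name, "guid") != 0)
+ *			return (0);	(* default rendering *)
+ *
+ *		nvlist_prtctl_doindent(pctl, 1);
+ *		nvlist_prtctl_dofmt(pctl, NVLIST_FMT_MEMBER_NAME, name);
+ *		(void) fprintf(nvlist_prtctl_getdest(pctl), "%llu",
+ *		    (u_longlong_t)value);
+ *		return (1);	(* postamble is emitted by the framework *)
+ *	}
+ *
+ *	nvlist_prtctlop_uint64(pctl, print_guid_decimal, NULL);
+ */
+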
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for array-valued nvlist members.
+ *
+ * One additional argument is taken: uint_t for the number of array elements
+ *
+ * Return values as above.
+ */
+#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \
+ extern void funcname(nvlist_prtctl_t, \
+ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \
+ void *)
+
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **);
+
+#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */
#ifdef __cplusplus
}
diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h b/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
index 269687e18f5f..7a5f8a8570c6 100644
--- a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
+++ b/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _LIBUUTIL_H
@@ -29,6 +28,7 @@
#include <solaris.h>
#include <sys/types.h>
#include <stdarg.h>
+#include <stdio.h>
#ifdef __cplusplus
extern "C" {
@@ -143,12 +143,21 @@ extern int uu_open_tmp(const char *dir, uint_t uflags);
/*
* Convenience functions.
*/
+#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0]))
+
/*PRINTFLIKE1*/
extern char *uu_msprintf(const char *format, ...);
extern void *uu_zalloc(size_t);
extern char *uu_strdup(const char *);
extern void uu_free(void *);
+extern boolean_t uu_strcaseeq(const char *a, const char *b);
+extern boolean_t uu_streq(const char *a, const char *b);
+extern char *uu_strndup(const char *s, size_t n);
+extern boolean_t uu_strbw(const char *a, const char *b);
+extern void *uu_memdup(const void *buf, size_t sz);
+extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len);
+
/*
* Comparison function type definition.
* Developers should be careful in their use of the _private argument. If you
diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
index 05d8622871fa..2bef759d525e 100644
--- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
+++ b/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "libuutil_common.h"
@@ -67,6 +66,44 @@ uu_strdup(const char *str)
return (buf);
}
+/*
+ * Duplicate up to n bytes of a string. Kind of sort of like
+ * strdup(strlcpy(s, n)).
+ */
+char *
+uu_strndup(const char *s, size_t n)
+{
+ size_t len;
+ char *p;
+
+ len = strnlen(s, n);
+ p = uu_zalloc(len + 1);
+ if (p == NULL)
+ return (NULL);
+
+ if (len > 0)
+ (void) memcpy(p, s, len);
+ p[len] = '\0';
+
+ return (p);
+}
+
+/*
+ * Duplicate a block of memory. Combines malloc with memcpy, much as
+ * strdup combines malloc, strlen, and strcpy.
+ */
+void *
+uu_memdup(const void *buf, size_t sz)
+{
+ void *p;
+
+ p = uu_zalloc(sz);
+ if (p == NULL)
+ return (NULL);
+ (void) memcpy(p, buf, sz);
+ return (p);
+}
+
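
Both duplicators return uu_zalloc()ed memory, so uu_free() releases it. A
quick sketch; the stat buffer sb is hypothetical:

    char *base;
    struct stat *copy;

    base = uu_strndup("pool/fs/child", 4);  /* "pool", NUL-terminated */
    copy = uu_memdup(&sb, sizeof (sb));     /* deep copy of sb */

    uu_free(base);
    uu_free(copy);
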
char *
uu_msprintf(const char *format, ...)
{
diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
index fb0c32bb8e47..507d4eb13087 100644
--- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
+++ b/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c
@@ -20,12 +20,9 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "libuutil_common.h"
#include <assert.h>
@@ -39,6 +36,7 @@
#include <sys/debug.h>
#include <thread.h>
#include <unistd.h>
+#include <ctype.h>
#if !defined(TEXT_DOMAIN)
#define TEXT_DOMAIN "SYS_TEST"
@@ -248,3 +246,30 @@ uu_init(void)
{
(void) pthread_atfork(uu_lockup, uu_release, uu_release_child);
}
+
+/*
+ * Dump a block of memory in hex+ascii, for debugging
+ */
+void
+uu_dump(FILE *out, const char *prefix, const void *buf, size_t len)
+{
+ const unsigned char *p = buf;
+ int i;
+
+ for (i = 0; i < len; i += 16) {
+ int j;
+
+ (void) fprintf(out, "%s", prefix);
+ for (j = 0; j < 16 && i + j < len; j++) {
+ (void) fprintf(out, "%2.2x ", p[i + j]);
+ }
+ for (; j < 16; j++) {
+ (void) fprintf(out, " ");
+ }
+ for (j = 0; j < 16 && i + j < len; j++) {
+ (void) fprintf(out, "%c",
+ isprint(p[i + j]) ? p[i + j] : '.');
+ }
+ (void) fprintf(out, "\n");
+ }
+}
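
Each output line is the prefix, sixteen space-separated hex bytes (blank-
padded on the last line), then the printable-or-dot rendering; approximately:

    uu_dump(stderr, "  ", "The quick brown fox ", 20);

      54 68 65 20 71 75 69 63 6b 20 62 72 6f 77 6e 20 The quick brown
      66 6f 78 20                                      fox
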
diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c
new file mode 100644
index 000000000000..66afba05e849
--- /dev/null
+++ b/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * String helper functions
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <ctype.h>
+#include "libuutil.h"
+
+/* Return true if strings are equal */
+boolean_t
+uu_streq(const char *a, const char *b)
+{
+ return (strcmp(a, b) == 0);
+}
+
+/* Return true if strings are equal, case-insensitively */
+boolean_t
+uu_strcaseeq(const char *a, const char *b)
+{
+ return (strcasecmp(a, b) == 0);
+}
+
+/* Return true if string a Begins With string b */
+boolean_t
+uu_strbw(const char *a, const char *b)
+{
+ return (strncmp(a, b, strlen(b)) == 0);
+}
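
The three predicates read as their names suggest:

    uu_streq("mirror", "mirror");               /* B_TRUE */
    uu_strcaseeq("RAIDZ", "raidz");             /* B_TRUE */
    uu_strbw("cachefile=/tmp/c", "cachefile");  /* B_TRUE: begins with */
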
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
index 5fad609ae385..fff63dde14c3 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
@@ -20,8 +20,8 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _LIBZFS_H
@@ -66,7 +66,6 @@ enum {
EZFS_BADSTREAM, /* bad backup stream */
EZFS_DSREADONLY, /* dataset is readonly */
EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */
- EZFS_VOLHASDATA, /* volume already contains data */
EZFS_INVALIDNAME, /* invalid dataset name */
EZFS_BADRESTORE, /* unable to restore to destination */
EZFS_BADBACKUP, /* backup failed */
@@ -85,17 +84,15 @@ enum {
EZFS_UMOUNTFAILED, /* failed to unmount dataset */
EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */
EZFS_SHARENFSFAILED, /* share(1M) failed */
- EZFS_DEVLINKS, /* failed to create zvol links */
EZFS_PERM, /* permission denied */
EZFS_NOSPC, /* out of space */
+ EZFS_FAULT, /* bad address */
EZFS_IO, /* I/O error */
EZFS_INTR, /* signal received */
EZFS_ISSPARE, /* device is a hot spare */
EZFS_INVALCONFIG, /* invalid vdev configuration */
EZFS_RECURSIVE, /* recursive dependency */
EZFS_NOHISTORY, /* no history object */
- EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
- EZFS_SHAREISCSIFAILED, /* iscsitgtd failed request to share */
EZFS_POOLPROPS, /* couldn't retrieve pool props */
EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */
EZFS_POOL_INVALARG, /* invalid argument for this pool operation */
@@ -103,12 +100,10 @@ enum {
EZFS_OPENFAILED, /* open of device failed */
EZFS_NOCAP, /* couldn't get capacity */
EZFS_LABELFAILED, /* write of label failed */
- EZFS_ISCSISVCUNAVAIL, /* iscsi service unavailable */
EZFS_BADWHO, /* invalid permission who */
EZFS_BADPERM, /* invalid permission */
EZFS_BADPERMSET, /* invalid permission set name */
EZFS_NODELEGATION, /* delegated administration is disabled */
- EZFS_PERMRDONLY, /* pemissions are readonly */
EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */
EZFS_SHARESMBFAILED, /* failed to share over smb */
EZFS_BADCACHE, /* bad cache file */
@@ -117,6 +112,17 @@ enum {
EZFS_NOTSUP, /* ops not supported on this dataset */
EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */
EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */
+ EZFS_REFTAG_RELE, /* snapshot release: tag not found */
+ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */
+ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */
+ EZFS_PIPEFAILED, /* pipe create failed */
+ EZFS_THREADCREATEFAILED, /* thread create failed */
+ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
+ EZFS_SCRUBBING, /* currently scrubbing */
+ EZFS_NO_SCRUB, /* no active scrub */
+ EZFS_DIFF, /* general failure of zfs diff */
+ EZFS_DIFFDATA, /* bad zfs diff data */
+ EZFS_POOLREADONLY, /* pool is in read-only mode */
EZFS_UNKNOWN
};
@@ -211,11 +217,19 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
extern int zpool_destroy(zpool_handle_t *);
extern int zpool_add(zpool_handle_t *, nvlist_t *);
+typedef struct splitflags {
+ /* do not split, but return the config that would be split off */
+ int dryrun : 1;
+
+ /* after splitting, import the pool */
+ int import : 1;
+} splitflags_t;
+
/*
* Functions to manipulate pool and vdev state
*/
-extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
-extern int zpool_clear(zpool_handle_t *, const char *);
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
+extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
@@ -224,13 +238,17 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *,
const char *, nvlist_t *, int);
extern int zpool_vdev_detach(zpool_handle_t *, const char *);
extern int zpool_vdev_remove(zpool_handle_t *, const char *);
+extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
+ splitflags_t);
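+
+/*
+ * A hedged sketch of a dry-run split driven by the flag bits above; the
+ * handle zhp and the new pool name are hypothetical:
+ *
+ *	splitflags_t flags = { 0 };
+ *	nvlist_t *newconfig = NULL;
+ *	char newname[] = "tanksplit";
+ *
+ *	flags.dryrun = 1;	(* compute the split-off config only *)
+ *	if (zpool_vdev_split(zhp, newname, &newconfig, NULL, flags) == 0)
+ *		dump_nvlist(newconfig, 4);
+ */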
-extern int zpool_vdev_fault(zpool_handle_t *, uint64_t);
-extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t);
+extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
+extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
boolean_t *, boolean_t *);
+extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
+ boolean_t *, boolean_t *, boolean_t *);
extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
/*
@@ -284,6 +302,7 @@ typedef enum {
ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device online */
+ ZPOOL_STATUS_REMOVED_DEV, /* removed device */
/*
* Finally, the following indicates a healthy pool.
@@ -293,6 +312,7 @@ typedef enum {
extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
extern zpool_status_t zpool_import_status(nvlist_t *, char **);
+extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
/*
* Statistics and configuration functions.
@@ -309,35 +329,53 @@ extern int zpool_export_force(zpool_handle_t *);
extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
char *altroot);
extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
- nvlist_t *, boolean_t);
+ nvlist_t *, int);
/*
* Search for pools to import
*/
+
+typedef struct importargs {
+ char **path; /* a list of paths to search */
+ int paths; /* number of paths to search */
+ char *poolname; /* name of a pool to find */
+ uint64_t guid; /* guid of a pool to find */
+ char *cachefile; /* cachefile to use for import */
+ int can_be_active : 1; /* can the pool be active? */
+ int unique : 1; /* does 'poolname' already exist? */
+ int exists : 1; /* set on return if pool already exists */
+} importargs_t;
+
+extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
+
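+/*
+ * A sketch of driving the unified search interface; the search path and
+ * pool name are hypothetical:
+ *
+ *	importargs_t args = { 0 };
+ *	char *dirs[] = { "/dev/dsk" };
+ *	nvlist_t *pools;
+ *
+ *	args.path = dirs;
+ *	args.paths = 1;
+ *	args.poolname = "tank";	(* restrict the search to one pool *)
+ *	args.unique = 1;	(* report a name collision via args.exists *)
+ *
+ *	pools = zpool_search_import(hdl, &args);
+ */
+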
+/* legacy pool search routines */
extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
char *, uint64_t);
-extern nvlist_t *zpool_find_import_byname(libzfs_handle_t *, int, char **,
- char *);
-extern nvlist_t *zpool_find_import_byguid(libzfs_handle_t *, int, char **,
- uint64_t);
-extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **);
/*
* Miscellaneous pool functions
*/
struct zfs_cmd;
-extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
+extern const char *zfs_history_event_names[LOG_END];
+
+extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
+ boolean_t verbose);
extern int zpool_upgrade(zpool_handle_t *, uint64_t);
extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+ nvlist_t ***, uint_t *);
extern void zpool_set_history_str(const char *subcommand, int argc,
char **argv, char *history_str);
extern int zpool_stage_history(libzfs_handle_t *, const char *);
extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
size_t len);
extern int zfs_ioctl(libzfs_handle_t *, unsigned long, struct zfs_cmd *);
-extern int zpool_get_physpath(zpool_handle_t *, char *);
+extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
+extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
+ nvlist_t *);
+
/*
* Basic handle manipulations. These functions do not create or destroy the
* underlying datasets, only the references to them.
@@ -368,6 +406,8 @@ extern const char *zfs_prop_to_name(zfs_prop_t);
extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
zprop_source_t *, char *, size_t, boolean_t);
+extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
+ boolean_t);
extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
zprop_source_t *, char *, size_t);
extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
@@ -375,10 +415,11 @@ extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
char *propbuf, int proplen, boolean_t literal);
extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
-extern int zfs_prop_inherit(zfs_handle_t *, const char *);
+extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
extern const char *zfs_prop_values(zfs_prop_t);
extern int zfs_prop_is_string(zfs_prop_t prop);
extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
typedef struct zprop_list {
int pl_prop;
@@ -386,10 +427,11 @@ typedef struct zprop_list {
struct zprop_list *pl_next;
boolean_t pl_all;
size_t pl_width;
+ size_t pl_recvd_width;
boolean_t pl_fixed;
} zprop_list_t;
-extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **);
+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t);
extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
#define ZFS_MOUNTPOINT_NONE "none"
@@ -413,13 +455,24 @@ extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
zfs_type_t);
extern void zprop_free_list(zprop_list_t *);
+#define ZFS_GET_NCOLS 5
+
+typedef enum {
+ GET_COL_NONE,
+ GET_COL_NAME,
+ GET_COL_PROPERTY,
+ GET_COL_VALUE,
+ GET_COL_RECVD,
+ GET_COL_SOURCE
+} zfs_get_column_t;
+
/*
* Functions for printing zfs or zpool properties
*/
typedef struct zprop_get_cbdata {
int cb_sources;
- int cb_columns[4];
- int cb_colwidths[5];
+ zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
+ int cb_colwidths[ZFS_GET_NCOLS + 1];
boolean_t cb_scripted;
boolean_t cb_literal;
boolean_t cb_first;
@@ -428,12 +481,8 @@ typedef struct zprop_get_cbdata {
} zprop_get_cbdata_t;
void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
- const char *, const char *, zprop_source_t, const char *);
-
-#define GET_COL_NAME 1
-#define GET_COL_PROPERTY 2
-#define GET_COL_VALUE 3
-#define GET_COL_SOURCE 4
+ const char *, const char *, zprop_source_t, const char *,
+ const char *);
/*
* Iterator functions.
@@ -444,6 +493,18 @@ extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
+
+typedef struct get_all_cb {
+ zfs_handle_t **cb_handles;
+ size_t cb_alloc;
+ size_t cb_used;
+ boolean_t cb_verbose;
+ int (*cb_getone)(zfs_handle_t *, void *);
+} get_all_cb_t;
+
+void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
+int libzfs_dataset_cmp(const void *, const void *);
/*
* Functions to create and destroy datasets.
@@ -451,21 +512,54 @@ extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
nvlist_t *);
extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
-extern int zfs_destroy(zfs_handle_t *);
-extern int zfs_destroy_snaps(zfs_handle_t *, char *);
+extern int zfs_destroy(zfs_handle_t *, boolean_t);
+extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
extern int zfs_rename(zfs_handle_t *, const char *, boolean_t);
-extern int zfs_send(zfs_handle_t *, const char *, const char *,
- boolean_t, boolean_t, boolean_t, boolean_t, int);
+
+typedef struct sendflags {
+ /* print informational messages (ie, -v was specified) */
+ int verbose : 1;
+
+ /* recursive send (ie, -R) */
+ int replicate : 1;
+
+ /* for incrementals, do all intermediate snapshots */
+ int doall : 1; /* (ie, -I) */
+
+ /* if dataset is a clone, do incremental from its origin */
+ int fromorigin : 1;
+
+ /* do deduplication */
+ int dedup : 1;
+
+ /* send properties (ie, -p) */
+ int props : 1;
+} sendflags_t;
+
+typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
+
+extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+ sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+ void *cb_arg, nvlist_t **debugnvp);
+
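A minimal caller sketch for the reworked zfs_send() entry point. The snapshot names and outfd are hypothetical, and passing NULL for the filter callback and debug nvlist is assumed to mean no filtering and no debug output:

    sendflags_t flags = { 0 };

    flags.replicate = 1;    /* -R */
    flags.doall = 1;        /* -I: all intermediate snapshots */
    flags.props = 1;        /* -p */

    if (zfs_send(zhp, "snap1", "snap2", flags, outfd,
        NULL, NULL, NULL) != 0)
        (void) fprintf(stderr, "send failed\n");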
extern int zfs_promote(zfs_handle_t *);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
+ boolean_t, boolean_t, int, uint64_t, uint64_t);
+extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
+extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
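A sketch of the new hold/release interface; the snapshot and tag names are made up, and the assumption (consistent with the zfs_hold() implementation further down) is that dsobj/createtxg matter only for temporary holds and that a cleanup_fd of -1 means none:

    /* Place, then drop, a user hold "keep" on zhp@today. */
    if (zfs_hold(zhp, "today", "keep", B_FALSE /* recursive */,
        B_FALSE /* temphold */, B_FALSE /* enoent_ok */,
        -1 /* cleanup_fd */, 0 /* dsobj */, 0 /* createtxg */) == 0)
        (void) zfs_release(zhp, "today", "keep", B_FALSE);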
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
-extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
- zfs_userspace_cb_t func, void *arg);
+extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
+ zfs_userspace_cb_t, void *);
+
+extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
+extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
typedef struct recvflags {
/* print informational messages (ie, -v was specified) */
@@ -474,6 +568,12 @@ typedef struct recvflags {
/* the destination is a prefix, not the exact fs (ie, -d) */
int isprefix : 1;
+ /*
+ * Only the tail of the sent snapshot path is appended to the
+ * destination to determine the received snapshot name (ie, -e).
+ */
+ int istail : 1;
+
/* do not actually do the recv, just check if it would work (ie, -n) */
int dryrun : 1;
@@ -493,6 +593,15 @@ typedef struct recvflags {
extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t,
int, avl_tree_t *);
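A receive-side sketch exercising the new istail (-e) flag; the stream descriptor and target name are hypothetical:

    recvflags_t rflags = { 0 };

    rflags.istail = 1;  /* -e: graft only the tail of the sent path */
    rflags.dryrun = 1;  /* -n: verify without receiving */
    if (zfs_receive(hdl, "backup", rflags, STDIN_FILENO, NULL) != 0)
        (void) fprintf(stderr, "receive check failed\n");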
+typedef enum diff_flags {
+ ZFS_DIFF_PARSEABLE = 0x1,
+ ZFS_DIFF_TIMESTAMP = 0x2,
+ ZFS_DIFF_CLASSIFY = 0x4
+} diff_flags_t;
+
+extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
+ int);
+
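The diff flags combine as a bitmask. A sketch, assuming the two int arguments are the output descriptor and the flag mask, and the strings the from/to snapshots:

    int dflags = ZFS_DIFF_PARSEABLE | ZFS_DIFF_TIMESTAMP;

    if (zfs_show_diffs(zhp, STDOUT_FILENO, "fs@snap1", "fs@snap2",
        dflags) != 0)
        (void) fprintf(stderr, "diff failed\n");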
/*
* Miscellaneous functions.
*/
@@ -534,12 +643,6 @@ extern int zfs_unshareall_nfs(zfs_handle_t *);
extern int zfs_unshareall_smb(zfs_handle_t *);
extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
extern int zfs_unshareall(zfs_handle_t *);
-extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
-extern int zfs_share_iscsi(zfs_handle_t *);
-extern int zfs_unshare_iscsi(zfs_handle_t *);
-#ifdef TODO
-extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *);
-#endif
extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
void *, void *, int, zfs_share_op_t);
@@ -572,15 +675,10 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
boolean_t *);
/*
- * ftyp special. Read the label from a given device.
+ * Label manipulation.
*/
extern int zpool_read_label(int, nvlist_t **);
-
-/*
- * Create and remove zvol /dev links.
- */
-extern int zpool_create_zvol_links(zpool_handle_t *);
-extern int zpool_remove_zvol_links(zpool_handle_t *);
+extern int zpool_clear_label(int);
/* is this zvol valid for use as a dump device? */
extern int zvol_check_dump_config(char *);
@@ -601,10 +699,21 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
-#ifdef __FreeBSD__
+/*
+ * Mappings between vdev and FRU.
+ */
+extern void libzfs_fru_refresh(libzfs_handle_t *);
+extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
+extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
+extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
+ const char *);
+extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
+extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
+
+#ifndef sun
extern int zmount(const char *, const char *, int, char *, char *, int, char *,
int);
-#endif
+#endif /* !sun */
#ifdef __cplusplus
}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
index 6fa196710983..4328d38a2c36 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Portions Copyright 2007 Ramprakash Jelari
@@ -116,32 +116,7 @@ changelist_prefix(prop_changelist_t *clp)
if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
continue;
- if (ZFS_IS_VOLUME(cn->cn_handle)) {
- switch (clp->cl_realprop) {
- case ZFS_PROP_NAME:
- /*
- * If this was a rename, unshare the zvol, and
- * remove the /dev/zvol links.
- */
- (void) zfs_unshare_iscsi(cn->cn_handle);
-
- if (zvol_remove_link(cn->cn_handle->zfs_hdl,
- cn->cn_handle->zfs_name) != 0) {
- ret = -1;
- cn->cn_needpost = B_FALSE;
- (void) zfs_share_iscsi(cn->cn_handle);
- }
- break;
-
- case ZFS_PROP_VOLSIZE:
- /*
- * If this was a change to the volume size, we
- * need to unshare and reshare the volume.
- */
- (void) zfs_unshare_iscsi(cn->cn_handle);
- break;
- }
- } else {
+ if (!ZFS_IS_VOLUME(cn->cn_handle)) {
/*
* Do the property specific processing.
*/
@@ -234,32 +209,8 @@ changelist_postfix(prop_changelist_t *clp)
zfs_refresh_properties(cn->cn_handle);
- if (ZFS_IS_VOLUME(cn->cn_handle)) {
- /*
- * If we're doing a rename, recreate the /dev/zvol
- * links.
- */
- if (clp->cl_realprop == ZFS_PROP_NAME &&
- zvol_create_link(cn->cn_handle->zfs_hdl,
- cn->cn_handle->zfs_name) != 0) {
- errors++;
- } else if (cn->cn_shared ||
- clp->cl_prop == ZFS_PROP_SHAREISCSI) {
- if (zfs_prop_get(cn->cn_handle,
- ZFS_PROP_SHAREISCSI, shareopts,
- sizeof (shareopts), NULL, NULL, 0,
- B_FALSE) == 0 &&
- strcmp(shareopts, "off") == 0) {
- errors +=
- zfs_unshare_iscsi(cn->cn_handle);
- } else {
- errors +=
- zfs_share_iscsi(cn->cn_handle);
- }
- }
-
+ if (ZFS_IS_VOLUME(cn->cn_handle))
continue;
- }
/*
* Remount if previously mounted or mountpoint was legacy,
@@ -508,6 +459,14 @@ change_one(zfs_handle_t *zhp, void *data)
&idx);
uu_list_insert(clp->cl_list, cn, idx);
} else {
+ /*
+			 * Add this child to the beginning of the list.
+			 * Children below this one in the hierarchy will get
+			 * added above this one in the list. This produces a
+			 * list in reverse dataset name order, which is
+			 * necessary when the original mountpoint is legacy
+			 * or none.

+ */
ASSERT(!clp->cl_alldependents);
verify(uu_list_insert_before(clp->cl_list,
uu_list_first(clp->cl_list), cn) == 0);
@@ -574,6 +533,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
zfs_handle_t *temp;
char property[ZFS_MAXPROPLEN];
uu_compare_fn_t *compare = NULL;
+ boolean_t legacy = B_FALSE;
if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL)
return (NULL);
@@ -586,8 +546,19 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED ||
prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS ||
prop == ZFS_PROP_SHARESMB) {
- compare = compare_mountpoints;
- clp->cl_sorted = B_TRUE;
+
+ if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+ property, sizeof (property),
+ NULL, NULL, 0, B_FALSE) == 0 &&
+ (strcmp(property, "legacy") == 0 ||
+ strcmp(property, "none") == 0)) {
+
+ legacy = B_TRUE;
+ }
+ if (!legacy) {
+ compare = compare_mountpoints;
+ clp->cl_sorted = B_TRUE;
+ }
}
clp->cl_pool = uu_list_pool_create("changelist_pool",
@@ -638,8 +609,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
clp->cl_prop != ZFS_PROP_SHARENFS &&
- clp->cl_prop != ZFS_PROP_SHARESMB &&
- clp->cl_prop != ZFS_PROP_SHAREISCSI)
+ clp->cl_prop != ZFS_PROP_SHARESMB)
return (clp);
/*
@@ -695,6 +665,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
(void) uu_list_find(clp->cl_list, cn, NULL, &idx);
uu_list_insert(clp->cl_list, cn, idx);
} else {
+ /*
+ * Add the target dataset to the end of the list.
+			 * The list is not really unsorted: it will end up
+			 * in reverse dataset name order. This is necessary
+			 * when the original mountpoint is legacy or none.
+ */
verify(uu_list_insert_after(clp->cl_list,
uu_list_last(clp->cl_list), cn) == 0);
}
@@ -703,11 +679,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
* If the mountpoint property was previously 'legacy', or 'none',
* record it as the behavior of changelist_postfix() will be different.
*/
- if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) &&
- (zfs_prop_get(zhp, prop, property, sizeof (property),
- NULL, NULL, 0, B_FALSE) == 0 &&
- (strcmp(property, "legacy") == 0 ||
- strcmp(property, "none") == 0))) {
+ if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) {
/*
* do not automatically mount ex-legacy datasets if
* we specifically set canmount to noauto
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
index 94640d1b128c..dc27238c9cf3 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* The pool configuration repository is stored in /etc/zfs/zpool.cache as a
* single packed nvlist. While it would be nice to just read in this
@@ -313,21 +311,33 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
zpool_handle_t *zhp;
int ret;
- if (namespace_reload(hdl) != 0)
+ /*
+ * If someone makes a recursive call to zpool_iter(), we want to avoid
+ * refreshing the namespace because that will invalidate the parent
+ * context. We allow recursive calls, but simply re-use the same
+ * namespace AVL tree.
+ */
+ if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0)
return (-1);
+ hdl->libzfs_pool_iter++;
for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
- if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0)
+ if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
+ hdl->libzfs_pool_iter--;
return (-1);
+ }
if (zhp == NULL)
continue;
- if ((ret = func(zhp, data)) != 0)
+ if ((ret = func(zhp, data)) != 0) {
+ hdl->libzfs_pool_iter--;
return (ret);
+ }
}
+ hdl->libzfs_pool_iter--;
return (0);
}
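The libzfs_pool_iter counter is what makes nested iteration safe. A hypothetical callback that re-enters zpool_iter() illustrates the point (zpool_get_handle() returns the owning libzfs_handle_t):

    static int
    visit_pair(zpool_handle_t *inner, void *outer)
    {
        /* compare 'outer' against 'inner' here (sketch) */
        return (0);
    }

    static int
    visit_pool(zpool_handle_t *zhp, void *data)
    {
        /*
         * Safe: with libzfs_pool_iter nonzero, the inner call skips
         * namespace_reload(), so the outer AVL walk stays valid.
         */
        return (zpool_iter(zpool_get_handle(zhp), visit_pair, zhp));
    }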
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
index 803746a6967b..824834e46766 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
@@ -20,11 +20,10 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
*/
-#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <libintl.h>
@@ -38,13 +37,13 @@
#include <fcntl.h>
#include <sys/mntent.h>
#include <sys/mount.h>
-#include <sys/avl.h>
#include <priv.h>
#include <pwd.h>
#include <grp.h>
#include <stddef.h>
#include <idmap.h>
+#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/misc.h>
@@ -55,7 +54,6 @@
#include "libzfs_impl.h"
#include "zfs_deleg.h"
-static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
static int userquota_propname_decode(const char *propname, boolean_t zoned,
zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
@@ -126,13 +124,14 @@ path_to_str(const char *path, int types)
* provide a more meaningful error message. We call zfs_error_aux() to
* explain exactly why the name was not valid.
*/
-static int
+int
zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
boolean_t modifying)
{
namecheck_err_t why;
char what;
+ (void) zfs_prop_get_table();
if (dataset_namecheck(path, &why, &what) != 0) {
if (hdl != NULL) {
switch (why) {
@@ -318,6 +317,7 @@ zpool_free_handles(libzfs_handle_t *hdl)
/*
* Utility function to gather stats (objset and zpl) for the given object.
*/
+static int
get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -336,6 +336,44 @@ get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
return (0);
}
+/*
+ * Utility function to get the received properties of the given object.
+ */
+static int
+get_recvd_props_ioctl(zfs_handle_t *zhp)
+{
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+ nvlist_t *recvdprops;
+ zfs_cmd_t zc = { 0 };
+ int err;
+
+ if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
+ return (-1);
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+ while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
+ if (errno == ENOMEM) {
+ if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+ return (-1);
+ }
+ } else {
+ zcmd_free_nvlists(&zc);
+ return (-1);
+ }
+ }
+
+ err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops);
+ zcmd_free_nvlists(&zc);
+ if (err != 0)
+ return (-1);
+
+ nvlist_free(zhp->zfs_recvd_props);
+ zhp->zfs_recvd_props = recvdprops;
+
+ return (0);
+}
+
static int
put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
@@ -397,70 +435,8 @@ zfs_refresh_properties(zfs_handle_t *zhp)
static int
make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
- char *logstr;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
- /*
- * Preserve history log string.
- * any changes performed here will be
- * logged as an internal event.
- */
- logstr = zhp->zfs_hdl->libzfs_log_str;
- zhp->zfs_hdl->libzfs_log_str = NULL;
-
-top:
- if (put_stats_zhdl(zhp, zc) != 0) {
- zhp->zfs_hdl->libzfs_log_str = logstr;
+ if (put_stats_zhdl(zhp, zc) != 0)
return (-1);
- }
-
-
- if (zhp->zfs_dmustats.dds_inconsistent) {
- zfs_cmd_t zc2 = { 0 };
-
- /*
- * If it is dds_inconsistent, then we've caught it in
- * the middle of a 'zfs receive' or 'zfs destroy', and
- * it is inconsistent from the ZPL's point of view, so
- * can't be mounted. However, it could also be that we
- * have crashed in the middle of one of those
- * operations, in which case we need to get rid of the
- * inconsistent state. We do that by either rolling
- * back to the previous snapshot (which will fail if
- * there is none), or destroying the filesystem. Note
- * that if we are still in the middle of an active
- * 'receive' or 'destroy', then the rollback and destroy
- * will fail with EBUSY and we will drive on as usual.
- */
-
- (void) strlcpy(zc2.zc_name, zhp->zfs_name,
- sizeof (zc2.zc_name));
-
- if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
- (void) zvol_remove_link(hdl, zhp->zfs_name);
- zc2.zc_objset_type = DMU_OST_ZVOL;
- } else {
- zc2.zc_objset_type = DMU_OST_ZFS;
- }
-
- /*
- * If we can successfully destroy it, pretend that it
- * never existed.
- */
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc2) == 0) {
- zhp->zfs_hdl->libzfs_log_str = logstr;
- errno = ENOENT;
- return (-1);
- }
- /* If we can successfully roll it back, reset the stats */
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc2) == 0) {
- if (get_stats_ioctl(zhp, zc) != 0) {
- zhp->zfs_hdl->libzfs_log_str = logstr;
- return (-1);
- }
- goto top;
- }
- }
/*
* We've managed to open the dataset and gather statistics. Determine
@@ -482,8 +458,9 @@ top:
else
abort(); /* we should never see any other types */
- zhp->zfs_hdl->libzfs_log_str = logstr;
- zhp->zpool_hdl = zpool_handle(zhp);
+ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL)
+ return (-1);
+
return (0);
}
@@ -585,6 +562,7 @@ zfs_close(zfs_handle_t *zhp)
free(zhp->zfs_mntopts);
nvlist_free(zhp->zfs_props);
nvlist_free(zhp->zfs_user_props);
+ nvlist_free(zhp->zfs_recvd_props);
free(zhp);
}
@@ -878,9 +856,14 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
goto error;
}
+ /*
+ * Encode the prop name as
+ * userquota@<hex-rid>-domain, to make it easy
+ * for the kernel to decode.
+ */
(void) snprintf(newpropname, sizeof (newpropname),
- "%s%s", zfs_userquota_prop_prefixes[uqtype],
- domain);
+ "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
+ (longlong_t)rid, domain);
valary[0] = uqtype;
valary[1] = rid;
valary[2] = intval;
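A worked example of the encoding with illustrative values (RID 123456 is 0x1e240), using a literal prefix in place of the zfs_userquota_prop_prefixes[] entry:

    char name[ZFS_MAXNAMELEN];

    (void) snprintf(name, sizeof (name), "%s%llx-%s",
        "userquota@", (longlong_t)123456, "sun.com");
    /* name is now "userquota@1e240-sun.com" */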
@@ -956,19 +939,66 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
}
break;
- case ZFS_PROP_SHAREISCSI:
- if (strcmp(strval, "off") != 0 &&
- strcmp(strval, "on") != 0 &&
- strcmp(strval, "type=disk") != 0) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be 'on', 'off', or 'type=disk'"),
- propname);
- (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
- goto error;
+ case ZFS_PROP_MLSLABEL:
+ {
+#ifdef sun
+ /*
+ * Verify the mlslabel string and convert to
+ * internal hex label string.
+ */
+
+ m_label_t *new_sl;
+ char *hex = NULL; /* internal label string */
+
+ /* Default value is already OK. */
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ break;
+
+ /* Verify the label can be converted to binary form */
+ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
+ (str_to_label(strval, &new_sl, MAC_LABEL,
+ L_NO_CORRECTION, NULL) == -1)) {
+ goto badlabel;
+ }
+
+ /* Now translate to hex internal label string */
+ if (label_to_str(new_sl, &hex, M_INTERNAL,
+ DEF_NAMES) != 0) {
+ if (hex)
+ free(hex);
+ goto badlabel;
}
+ m_label_free(new_sl);
+
+ /* If string is already in internal form, we're done. */
+ if (strcmp(strval, hex) == 0) {
+ free(hex);
+ break;
+ }
+
+ /* Replace the label string with the internal form. */
+ (void) nvlist_remove(ret, zfs_prop_to_name(prop),
+ DATA_TYPE_STRING);
+ verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
+ hex) == 0);
+ free(hex);
break;
+badlabel:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid mlslabel '%s'"), strval);
+ (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+ m_label_free(new_sl); /* OK if null */
+#else /* !sun */
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "mlslabel is not supported on FreeBSD"));
+ (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+#endif /* !sun */
+ goto error;
+
+ }
+
case ZFS_PROP_MOUNTPOINT:
{
namecheck_err_t why;
@@ -1187,39 +1217,130 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
+ return (ret);
+
+error:
+ nvlist_free(ret);
+ return (NULL);
+}
+
+int
+zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
+{
+ uint64_t old_volsize;
+ uint64_t new_volsize;
+ uint64_t old_reservation;
+ uint64_t new_reservation;
+ zfs_prop_t resv_prop;
/*
* If this is an existing volume, and someone is setting the volsize,
* make sure that it matches the reservation, or add it if necessary.
*/
- if (zhp != NULL && type == ZFS_TYPE_VOLUME &&
- nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &intval) == 0) {
- uint64_t old_volsize = zfs_prop_get_int(zhp,
- ZFS_PROP_VOLSIZE);
- uint64_t old_reservation;
- uint64_t new_reservation;
- zfs_prop_t resv_prop;
+ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
+ if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
+ return (-1);
+ old_reservation = zfs_prop_get_int(zhp, resv_prop);
+ if ((zvol_volsize_to_reservation(old_volsize, zhp->zfs_props) !=
+ old_reservation) || nvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(resv_prop), &new_reservation) != ENOENT) {
+ return (0);
+ }
+ if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+ &new_volsize) != 0)
+ return (-1);
+ new_reservation = zvol_volsize_to_reservation(new_volsize,
+ zhp->zfs_props);
+ if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
+ new_reservation) != 0) {
+ (void) no_memory(zhp->zfs_hdl);
+ return (-1);
+ }
+ return (1);
+}
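Note the tri-state return: -1 on error, 0 when the reservation already matches or one was supplied explicitly, and 1 when a synthetic reservation was appended to nvl. zfs_prop_set() below keeps that 1 in added_resv so it can restore the old volsize if the ioctl later fails with ENOSPC.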
- if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
- goto error;
- old_reservation = zfs_prop_get_int(zhp, resv_prop);
+void
+zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
+ char *errbuf)
+{
+ switch (err) {
- if (old_volsize == old_reservation &&
- nvlist_lookup_uint64(ret, zfs_prop_to_name(resv_prop),
- &new_reservation) != 0) {
- if (nvlist_add_uint64(ret,
- zfs_prop_to_name(resv_prop), intval) != 0) {
- (void) no_memory(hdl);
- goto error;
- }
+ case ENOSPC:
+ /*
+ * For quotas and reservations, ENOSPC indicates
+ * something different; setting a quota or reservation
+ * doesn't use any disk space.
+ */
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_REFQUOTA:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "size is less than current used or "
+ "reserved space"));
+ (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
+ break;
+
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFRESERVATION:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "size is greater than available space"));
+ (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
+ break;
+
+ default:
+ (void) zfs_standard_error(hdl, err, errbuf);
+ break;
}
- }
- return (ret);
+ break;
-error:
- nvlist_free(ret);
- return (NULL);
+ case EBUSY:
+ (void) zfs_standard_error(hdl, EBUSY, errbuf);
+ break;
+
+ case EROFS:
+ (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
+ break;
+
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool and or dataset must be upgraded to set this "
+ "property or value"));
+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+ break;
+
+ case ERANGE:
+ if (prop == ZFS_PROP_COMPRESSION) {
+ (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "property setting is not allowed on "
+ "bootable datasets"));
+ (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
+ } else {
+ (void) zfs_standard_error(hdl, err, errbuf);
+ }
+ break;
+
+ case EINVAL:
+ if (prop == ZPROP_INVAL) {
+ (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+ } else {
+ (void) zfs_standard_error(hdl, err, errbuf);
+ }
+ break;
+
+ case EOVERFLOW:
+ /*
+ * This platform can't address a volume this big.
+ */
+#ifdef _ILP32
+ if (prop == ZFS_PROP_VOLSIZE) {
+ (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
+ break;
+ }
+#endif
+ /* FALLTHROUGH */
+ default:
+ (void) zfs_standard_error(hdl, err, errbuf);
+ }
}
/*
@@ -1237,6 +1358,7 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
zfs_prop_t prop;
boolean_t do_prefix;
uint64_t idx;
+	int added_resv = 0;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -1260,17 +1382,22 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
/* We don't support those properties on FreeBSD. */
switch (prop) {
case ZFS_PROP_DEVICES:
- case ZFS_PROP_SHAREISCSI:
case ZFS_PROP_ISCSIOPTIONS:
case ZFS_PROP_XATTR:
case ZFS_PROP_VSCAN:
case ZFS_PROP_NBMAND:
+ case ZFS_PROP_MLSLABEL:
(void) snprintf(errbuf, sizeof (errbuf),
"property '%s' not supported on FreeBSD", propname);
ret = zfs_error(hdl, EZFS_PERM, errbuf);
goto error;
}
+ if (prop == ZFS_PROP_VOLSIZE) {
+ if ((added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1)
+ goto error;
+ }
+
if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
goto error;
@@ -1304,78 +1431,22 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
if (ret != 0) {
- switch (errno) {
-
- case ENOSPC:
- /*
- * For quotas and reservations, ENOSPC indicates
- * something different; setting a quota or reservation
- * doesn't use any disk space.
- */
- switch (prop) {
- case ZFS_PROP_QUOTA:
- case ZFS_PROP_REFQUOTA:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "size is less than current used or "
- "reserved space"));
- (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
- break;
-
- case ZFS_PROP_RESERVATION:
- case ZFS_PROP_REFRESERVATION:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "size is greater than available space"));
- (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
- break;
-
- default:
- (void) zfs_standard_error(hdl, errno, errbuf);
- break;
- }
- break;
-
- case EBUSY:
- if (prop == ZFS_PROP_VOLBLOCKSIZE)
- (void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf);
- else
- (void) zfs_standard_error(hdl, EBUSY, errbuf);
- break;
-
- case EROFS:
- (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
- break;
-
- case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool and or dataset must be upgraded to set this "
- "property or value"));
- (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
- break;
-
- case ERANGE:
- if (prop == ZFS_PROP_COMPRESSION) {
- (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "property setting is not allowed on "
- "bootable datasets"));
- (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
- } else {
- (void) zfs_standard_error(hdl, errno, errbuf);
- }
- break;
-
- case EOVERFLOW:
- /*
- * This platform can't address a volume this big.
- */
-#ifdef _ILP32
- if (prop == ZFS_PROP_VOLSIZE) {
- (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
- break;
- }
-#endif
- /* FALLTHROUGH */
- default:
- (void) zfs_standard_error(hdl, errno, errbuf);
+ zfs_setprop_error(hdl, prop, errno, errbuf);
+ if (added_resv && errno == ENOSPC) {
+ /* clean up the volsize property we tried to set */
+ uint64_t old_volsize = zfs_prop_get_int(zhp,
+ ZFS_PROP_VOLSIZE);
+ nvlist_free(nvl);
+ zcmd_free_nvlists(&zc);
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ goto error;
+ if (nvlist_add_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+ old_volsize) != 0)
+ goto error;
+ if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
+ goto error;
+ (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
}
} else {
if (do_prefix)
@@ -1398,10 +1469,11 @@ error:
}
/*
- * Given a property, inherit the value from the parent dataset.
+ * Given a property, inherit the value from the parent dataset, or if received
+ * is TRUE, revert to the received value, if any.
*/
int
-zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
+zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
{
zfs_cmd_t zc = { 0 };
int ret;
@@ -1413,6 +1485,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot inherit %s for '%s'"), propname, zhp->zfs_name);
+ zc.zc_cookie = received;
if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
/*
* For user properties, the amount of work we have to do is very
@@ -1439,7 +1512,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
if (zfs_prop_readonly(prop))
return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
- if (!zfs_prop_inheritable(prop))
+ if (!zfs_prop_inheritable(prop) && !received)
return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
/*
@@ -1449,7 +1522,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
/*
- * Normalize the name, to get rid of shorthand abbrevations.
+ * Normalize the name, to get rid of shorthand abbreviations.
*/
propname = zfs_prop_to_name(prop);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -1544,6 +1617,26 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
return (value);
}
+static boolean_t
+zfs_is_recvd_props_mode(zfs_handle_t *zhp)
+{
+ return (zhp->zfs_props == zhp->zfs_recvd_props);
+}
+
+static void
+zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
+{
+ *cookie = (uint64_t)(uintptr_t)zhp->zfs_props;
+ zhp->zfs_props = zhp->zfs_recvd_props;
+}
+
+static void
+zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
+{
+ zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie;
+ *cookie = 0;
+}
+
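The cookie is just the saved native zfs_props pointer, so the swap must be strictly bracketed. The pattern, as used by zfs_prop_get_recvd() below:

    uint64_t cookie;

    zfs_set_recvd_props_mode(zhp, &cookie);   /* props = received props */
    err = zfs_prop_get(zhp, prop, buf, sizeof (buf),
        NULL, NULL, 0, B_FALSE);
    zfs_unset_recvd_props_mode(zhp, &cookie); /* restore native props */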
/*
* Internal function for getting a numeric property. Both zfs_prop_get() and
* zfs_prop_get_int() are built using this interface.
@@ -1562,6 +1655,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
struct mnttab mnt;
char *mntopt_on = NULL;
char *mntopt_off = NULL;
+ boolean_t received = zfs_is_recvd_props_mode(zhp);
*source = NULL;
@@ -1637,6 +1731,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
case ZFS_PROP_NBMAND:
*val = getprop_uint64(zhp, prop, source);
+ if (received)
+ break;
+
if (hasmntopt(&mnt, mntopt_on) && !*val) {
*val = B_TRUE;
if (src)
@@ -1649,22 +1746,17 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
break;
case ZFS_PROP_CANMOUNT:
- *val = getprop_uint64(zhp, prop, source);
- if (*val != ZFS_CANMOUNT_ON)
- *source = zhp->zfs_name;
- else
- *source = ""; /* default */
- break;
-
+ case ZFS_PROP_VOLSIZE:
case ZFS_PROP_QUOTA:
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
*val = getprop_uint64(zhp, prop, source);
- if (*val == 0)
- *source = ""; /* default */
- else
+
+ if (*source == NULL) {
+ /* not default, must be local */
*source = zhp->zfs_name;
+ }
break;
case ZFS_PROP_MOUNTED:
@@ -1685,21 +1777,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
zcmd_free_nvlists(&zc);
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "unable to get %s property"),
- zfs_prop_to_name(prop));
- return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION,
- dgettext(TEXT_DOMAIN, "internal error")));
+ return (-1);
}
if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
val) != 0) {
zcmd_free_nvlists(&zc);
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "unable to get %s property"),
- zfs_prop_to_name(prop));
- return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM,
- dgettext(TEXT_DOMAIN, "internal error")));
+ return (-1);
}
if (zplprops)
nvlist_free(zplprops);
@@ -1714,11 +1798,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
/*
* If we tried to use a default value for a
* readonly property, it means that it was not
- * present; return an error.
+ * present.
*/
if (zfs_prop_readonly(prop) &&
- *source && (*source)[0] == '\0') {
- return (-1);
+ *source != NULL && (*source)[0] == '\0') {
+ *source = NULL;
}
break;
@@ -1748,6 +1832,8 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
*srctype = ZPROP_SRC_NONE;
} else if (source[0] == '\0') {
*srctype = ZPROP_SRC_DEFAULT;
+ } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
+ *srctype = ZPROP_SRC_RECEIVED;
} else {
if (strcmp(source, zhp->zfs_name) == 0) {
*srctype = ZPROP_SRC_LOCAL;
@@ -1759,6 +1845,43 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
}
+int
+zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
+ size_t proplen, boolean_t literal)
+{
+ zfs_prop_t prop;
+ int err = 0;
+
+ if (zhp->zfs_recvd_props == NULL)
+ if (get_recvd_props_ioctl(zhp) != 0)
+ return (-1);
+
+ prop = zfs_name_to_prop(propname);
+
+ if (prop != ZPROP_INVAL) {
+ uint64_t cookie;
+ if (!nvlist_exists(zhp->zfs_recvd_props, propname))
+ return (-1);
+ zfs_set_recvd_props_mode(zhp, &cookie);
+ err = zfs_prop_get(zhp, prop, propbuf, proplen,
+ NULL, NULL, 0, literal);
+ zfs_unset_recvd_props_mode(zhp, &cookie);
+ } else if (zfs_prop_userquota(propname)) {
+ return (-1);
+ } else {
+ nvlist_t *propval;
+ char *recvdval;
+ if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
+ propname, &propval) != 0)
+ return (-1);
+ verify(nvlist_lookup_string(propval, ZPROP_VALUE,
+ &recvdval) == 0);
+ (void) strlcpy(propbuf, recvdval, proplen);
+ }
+
+ return (err == 0 ? 0 : -1);
+}
+
/*
* Retrieve a property from the given object. If 'literal' is specified, then
* numbers are left as exact values. Otherwise, numbers are converted to a
@@ -1774,6 +1897,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
uint64_t val;
char *str;
const char *strval;
+ boolean_t received = zfs_is_recvd_props_mode(zhp);
/*
* Check to see if this property applies to our object
@@ -1781,6 +1905,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
return (-1);
+ if (received && zfs_prop_readonly(prop))
+ return (-1);
+
if (src)
*src = ZPROP_SRC_NONE;
@@ -1820,10 +1947,22 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
if (str[0] == '/') {
char buf[MAXPATHLEN];
char *root = buf;
- const char *relpath = zhp->zfs_name + strlen(source);
+ const char *relpath;
- if (relpath[0] == '/')
- relpath++;
+ /*
+ * If we inherit the mountpoint, even from a dataset
+ * with a received value, the source will be the path of
+ * the dataset we inherit from. If source is
+ * ZPROP_SOURCE_VAL_RECVD, the received value is not
+ * inherited.
+ */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+ relpath = "";
+ } else {
+ relpath = zhp->zfs_name + strlen(source);
+ if (relpath[0] == '/')
+ relpath++;
+ }
if ((zpool_get_prop(zhp->zpool_hdl,
ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) ||
@@ -1902,8 +2041,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
case ZFS_PROP_COMPRESSRATIO:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
- (void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
- val / 100, (longlong_t)val % 100);
+ (void) snprintf(propbuf, proplen, "%llu.%02llux",
+ (u_longlong_t)(val / 100),
+ (u_longlong_t)(val % 100));
break;
case ZFS_PROP_TYPE:
@@ -1948,6 +2088,48 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
(void) strlcpy(propbuf, zhp->zfs_name, proplen);
break;
+ case ZFS_PROP_MLSLABEL:
+ {
+#ifdef sun
+ m_label_t *new_sl = NULL;
+ char *ascii = NULL; /* human readable label */
+
+ (void) strlcpy(propbuf,
+ getprop_string(zhp, prop, &source), proplen);
+
+ if (literal || (strcasecmp(propbuf,
+ ZFS_MLSLABEL_DEFAULT) == 0))
+ break;
+
+ /*
+ * Try to translate the internal hex string to
+ * human-readable output. If there are any
+ * problems just use the hex string.
+ */
+
+ if (str_to_label(propbuf, &new_sl, MAC_LABEL,
+ L_NO_CORRECTION, NULL) == -1) {
+ m_label_free(new_sl);
+ break;
+ }
+
+ if (label_to_str(new_sl, &ascii, M_LABEL,
+ DEF_NAMES) != 0) {
+ if (ascii)
+ free(ascii);
+ m_label_free(new_sl);
+ break;
+ }
+ m_label_free(new_sl);
+
+ (void) strlcpy(propbuf, ascii, proplen);
+ free(ascii);
+#else /* !sun */
+ propbuf[0] = '\0';
+#endif /* !sun */
+ }
+ break;
+
default:
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
@@ -2044,14 +2226,11 @@ idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
char **domainp, idmap_rid_t *ridp)
{
#ifdef sun
- idmap_handle_t *idmap_hdl = NULL;
idmap_get_handle_t *get_hdl = NULL;
idmap_stat status;
int err = EINVAL;
- if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS)
- goto out;
- if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS)
+ if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
goto out;
if (isuser) {
@@ -2070,29 +2249,12 @@ idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
out:
if (get_hdl)
idmap_get_destroy(get_hdl);
- if (idmap_hdl)
- (void) idmap_fini(idmap_hdl);
return (err);
#else /* !sun */
assert(!"invalid code path");
#endif /* !sun */
}
-#ifndef sun
-/* Check if a string contains only digits */
-static int
-string_is_digits(char *cp)
-{
- int i;
-
- for(i = 0; i < strlen(cp); i++)
- if(!isdigit(cp[i]))
- return (0);
- return (1);
-}
-
-#endif /* !sun */
-
/*
* convert the propname into parameters needed by kernel
* Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
@@ -2131,7 +2293,6 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
* turned into S-1-domainID-RID.
*/
directory_error_t e;
-
if (zoned && getzoneid() == GLOBAL_ZONEID)
return (ENOENT);
if (isuser) {
@@ -2150,7 +2311,7 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
cp = numericsid;
/* will be further decoded below */
#else /* !sun */
- return (ENOENT);
+ return (ENOENT);
#endif /* !sun */
}
@@ -2169,15 +2330,7 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
}
if (errno != 0 || *end != '\0')
return (EINVAL);
-#ifdef sun
} else if (!isdigit(*cp)) {
-#else /* sun */
- /*
- * In FreeBSD user and group names can begin with a digit so treat
- * as a uid/gid if string contains digits only
- */
- } else if (!string_is_digits(cp)) {
-#endif /* sun */
/*
* It's a user/group name (eg "user") that needs to be
* turned into a uid/gid
@@ -2309,13 +2462,6 @@ top:
(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
- /*
- * FreeBSD compatibility with pre-v15 kernel module.
- * Ignore private dataset names.
- */
- if (strchr(zc->zc_name, '$') != NULL)
- rc = 0;
-
if (rc == -1) {
switch (errno) {
case ENOMEM:
@@ -2363,14 +2509,6 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
&zc)) == 0) {
-
- /*
- * FreeBSD compatibility with pre-v15 kernel module.
- * Ignore private dataset names.
- */
- if (strchr(zc.zc_name, '$') != NULL)
- continue;
-
/*
* Silently ignore errors, as the only plausible explanation is
* that the pool has since been removed.
@@ -2407,13 +2545,6 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
&zc)) == 0) {
- /*
- * FreeBSD compatibility with pre-v15 kernel module.
- * Ignore private dataset names.
- */
- if (strchr(zc.zc_name, '$') != NULL)
- continue;
-
if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
&zc)) == NULL) {
continue;
@@ -2443,6 +2574,27 @@ zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
}
/*
+ * Is one dataset name a child dataset of another?
+ *
+ * Needs to handle these cases:
+ *   Dataset 1    "a/foo"      "a/foo"      "a/foo"      "a/foo"
+ *   Dataset 2    "a/fo"       "a/foobar"   "a/bar/baz"  "a/foo/bar"
+ *   Descendant?  No.          No.          No.          Yes.
+ */
+static boolean_t
+is_descendant(const char *ds1, const char *ds2)
+{
+ size_t d1len = strlen(ds1);
+
+ /* ds2 can't be a descendant if it's smaller */
+ if (strlen(ds2) < d1len)
+ return (B_FALSE);
+
+ /* otherwise, compare strings and verify that there's a '/' char */
+ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0));
+}
+
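A few assertions matching the truth table above; note that the '/' check rejects the "a/foobar" prefix case, and an equal name fails because ds2[d1len] is the NUL terminator:

    assert(is_descendant("a/foo", "a/foo/bar"));
    assert(!is_descendant("a/foo", "a/foobar"));  /* no '/' at boundary */
    assert(!is_descendant("a/foo", "a/fo"));      /* ds2 is shorter */
    assert(!is_descendant("a/foo", "a/foo"));     /* equal, not a child */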
+/*
* Given a complete name, return just the portion that refers to the parent.
* Can return NULL if this is a pool.
*/
@@ -2477,6 +2629,7 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
char *slash;
zfs_handle_t *zhp;
char errbuf[1024];
+ uint64_t is_zoned;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
@@ -2519,9 +2672,12 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
return (zfs_standard_error(hdl, errno, errbuf));
}
- *zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+ is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+ if (zoned != NULL)
+ *zoned = is_zoned;
+
/* we are in a non-global zone, but parent is in the global zone */
- if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) {
+ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
(void) zfs_standard_error(hdl, EPERM, errbuf);
zfs_close(zhp);
return (-1);
@@ -2653,11 +2809,10 @@ int
zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
{
int prefix;
- uint64_t zoned;
char *path_copy;
int rc;
- if (check_parents(hdl, path, &zoned, B_TRUE, &prefix) != 0)
+ if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
return (-1);
if ((path_copy = strdup(path)) != NULL) {
@@ -2771,18 +2926,6 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
/* create the dataset */
ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc);
- if (ret == 0 && type == ZFS_TYPE_VOLUME) {
- ret = zvol_create_link(hdl, path);
- if (ret) {
- (void) zfs_standard_error(hdl, errno,
- dgettext(TEXT_DOMAIN,
- "Volume successfully created, but device links "
- "were not created"));
- zcmd_free_nvlists(&zc);
- return (-1);
- }
- }
-
zcmd_free_nvlists(&zc);
/* check for failure */
@@ -2838,30 +2981,19 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
* isn't mounted, and that there are no active dependents.
*/
int
-zfs_destroy(zfs_handle_t *zhp)
+zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
{
zfs_cmd_t zc = { 0 };
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (ZFS_IS_VOLUME(zhp)) {
- /*
- * If user doesn't have permissions to unshare volume, then
- * abort the request. This would only happen for a
- * non-privileged user.
- */
- if (zfs_unshare_iscsi(zhp) != 0) {
- return (-1);
- }
-
- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
- return (-1);
-
zc.zc_objset_type = DMU_OST_ZVOL;
} else {
zc.zc_objset_type = DMU_OST_ZFS;
}
+ zc.zc_defer_destroy = defer;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) {
return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
@@ -2880,13 +3012,13 @@ struct destroydata {
};
static int
-zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
+zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
{
struct destroydata *dd = arg;
zfs_handle_t *szhp;
char name[ZFS_MAXNAMELEN];
boolean_t closezhp = dd->closezhp;
- int rv;
+ int rv = 0;
(void) strlcpy(name, zhp->zfs_name, sizeof (name));
(void) strlcat(name, "@", sizeof (name));
@@ -2898,17 +3030,9 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
zfs_close(szhp);
}
- if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
- (void) zvol_remove_link(zhp->zfs_hdl, name);
- /*
- * NB: this is simply a best-effort. We don't want to
- * return an error, because then we wouldn't visit all
- * the volumes.
- */
- }
-
dd->closezhp = B_TRUE;
- rv = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg);
+ if (!dd->gotone)
+ rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
if (closezhp)
zfs_close(zhp);
return (rv);
@@ -2918,14 +3042,14 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
* Destroys all snapshots with the given name in zhp & descendants.
*/
int
-zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
+zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
{
zfs_cmd_t zc = { 0 };
int ret;
struct destroydata dd = { 0 };
dd.snapname = snapname;
- (void) zfs_remove_link_cb(zhp, &dd);
+ (void) zfs_check_snap_cb(zhp, &dd);
if (!dd.gotone) {
return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
@@ -2935,6 +3059,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ zc.zc_defer_destroy = defer;
ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc);
if (ret != 0) {
@@ -3042,70 +3167,11 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
return (zfs_standard_error(zhp->zfs_hdl, errno,
errbuf));
}
- } else if (ZFS_IS_VOLUME(zhp)) {
- ret = zvol_create_link(zhp->zfs_hdl, target);
}
return (ret);
}
-typedef struct promote_data {
- char cb_mountpoint[MAXPATHLEN];
- const char *cb_target;
- const char *cb_errbuf;
- uint64_t cb_pivot_txg;
-} promote_data_t;
-
-static int
-promote_snap_cb(zfs_handle_t *zhp, void *data)
-{
- promote_data_t *pd = data;
- zfs_handle_t *szhp;
- char snapname[MAXPATHLEN];
- int rv = 0;
-
- /* We don't care about snapshots after the pivot point */
- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) {
- zfs_close(zhp);
- return (0);
- }
-
- /* Remove the device link if it's a zvol. */
- if (ZFS_IS_VOLUME(zhp))
- (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
-
- /* Check for conflicting names */
- (void) strlcpy(snapname, pd->cb_target, sizeof (snapname));
- (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname));
- szhp = make_dataset_handle(zhp->zfs_hdl, snapname);
- if (szhp != NULL) {
- zfs_close(szhp);
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "snapshot name '%s' from origin \n"
- "conflicts with '%s' from target"),
- zhp->zfs_name, snapname);
- rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf);
- }
- zfs_close(zhp);
- return (rv);
-}
-
-static int
-promote_snap_done_cb(zfs_handle_t *zhp, void *data)
-{
- promote_data_t *pd = data;
-
- /* We don't care about snapshots after the pivot point */
- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
- /* Create the device link if it's a zvol. */
- if (ZFS_IS_VOLUME(zhp))
- (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
- }
-
- zfs_close(zhp);
- return (0);
-}
-
/*
* Promotes the given clone fs to be the clone parent.
*/
@@ -3115,10 +3181,7 @@ zfs_promote(zfs_handle_t *zhp)
libzfs_handle_t *hdl = zhp->zfs_hdl;
zfs_cmd_t zc = { 0 };
char parent[MAXPATHLEN];
- char *cp;
int ret;
- zfs_handle_t *pzhp;
- promote_data_t pd;
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
@@ -3136,29 +3199,7 @@ zfs_promote(zfs_handle_t *zhp)
"not a cloned filesystem"));
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
- cp = strchr(parent, '@');
- *cp = '\0';
- /* Walk the snapshots we will be moving */
- pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
- if (pzhp == NULL)
- return (-1);
- pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
- zfs_close(pzhp);
- pd.cb_target = zhp->zfs_name;
- pd.cb_errbuf = errbuf;
- pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET);
- if (pzhp == NULL)
- return (-1);
- (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
- sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
- ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd);
- if (ret != 0) {
- zfs_close(pzhp);
- return (-1);
- }
-
- /* issue the ioctl */
(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin,
sizeof (zc.zc_value));
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -3167,62 +3208,18 @@ zfs_promote(zfs_handle_t *zhp)
if (ret != 0) {
int save_errno = errno;
- (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
- zfs_close(pzhp);
-
switch (save_errno) {
case EEXIST:
- /*
- * There is a conflicting snapshot name. We
- * should have caught this above, but they could
- * have renamed something in the mean time.
- */
+ /* There is a conflicting snapshot name. */
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "conflicting snapshot name from parent '%s'"),
- parent);
+ "conflicting snapshot '%s' from parent '%s'"),
+ zc.zc_string, parent);
return (zfs_error(hdl, EZFS_EXISTS, errbuf));
default:
return (zfs_standard_error(hdl, save_errno, errbuf));
}
- } else {
- (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
- }
-
- zfs_close(pzhp);
- return (ret);
-}
-
-struct createdata {
- const char *cd_snapname;
- int cd_ifexists;
-};
-
-static int
-zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
-{
- struct createdata *cd = arg;
- int ret;
-
- if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
- char name[MAXPATHLEN];
-
- (void) strlcpy(name, zhp->zfs_name, sizeof (name));
- (void) strlcat(name, "@", sizeof (name));
- (void) strlcat(name, cd->cd_snapname, sizeof (name));
- (void) zvol_create_link_common(zhp->zfs_hdl, name,
- cd->cd_ifexists);
- /*
- * NB: this is simply a best-effort. We don't want to
- * return an error, because then we wouldn't visit all
- * the volumes.
- */
}
-
- ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
-
- zfs_close(zhp);
-
return (ret);
}
@@ -3286,31 +3283,11 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
* if it was recursive, the one that actually failed will be in
* zc.zc_name.
*/
- if (ret != 0)
+ if (ret != 0) {
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
-
- if (ret == 0 && recursive) {
- struct createdata cd;
-
- cd.cd_snapname = delim + 1;
- cd.cd_ifexists = B_FALSE;
- (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
- }
- if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
- ret = zvol_create_link(zhp->zfs_hdl, path);
- if (ret != 0) {
- (void) zfs_standard_error(hdl, errno,
- dgettext(TEXT_DOMAIN,
- "Volume successfully snapshotted, but device links "
- "were not created"));
- zfs_close(zhp);
- return (-1);
- }
- }
-
- if (ret != 0)
(void) zfs_standard_error(hdl, errno, errbuf);
+ }
zfs_close(zhp);
@@ -3350,7 +3327,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data)
logstr = zhp->zfs_hdl->libzfs_log_str;
zhp->zfs_hdl->libzfs_log_str = NULL;
- cbp->cb_error |= zfs_destroy(zhp);
+ cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
zhp->zfs_hdl->libzfs_log_str = logstr;
}
} else {
@@ -3364,7 +3341,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data)
zfs_close(zhp);
return (0);
}
- if (zfs_destroy(zhp) != 0)
+ if (zfs_destroy(zhp, B_FALSE) != 0)
cbp->cb_error = B_TRUE;
else
changelist_remove(clp, zhp->zfs_name);
@@ -3413,8 +3390,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
*/
if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
- return (-1);
if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
return (-1);
old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
@@ -3452,10 +3427,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
*/
if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
(zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
- if (err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name)) {
- zfs_close(zhp);
- return (err);
- }
if (restore_resv) {
new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
if (old_volsize != new_volsize)
@@ -3570,14 +3541,11 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
- uint64_t unused;
/* validate parents */
- if (check_parents(hdl, target, &unused, B_FALSE, NULL) != 0)
+ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
return (-1);
- (void) parent_name(target, parent, sizeof (parent));
-
/* make sure we're in the same pool */
verify((delim = strchr(target, '/')) != NULL);
if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
@@ -3588,10 +3556,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
}
/* new name cannot be a child of the current dataset name */
- if (strncmp(parent, zhp->zfs_name,
- strlen(zhp->zfs_name)) == 0) {
+ if (is_descendant(zhp->zfs_name, target)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "New dataset name cannot be a descendent of "
+ "New dataset name cannot be a descendant of "
"current dataset name"));
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
}
@@ -3608,7 +3575,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
}
if (recursive) {
- struct destroydata dd;
parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
if (parentname == NULL) {
@@ -3623,15 +3589,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
goto error;
}
- dd.snapname = delim + 1;
- dd.gotone = B_FALSE;
- dd.closezhp = B_TRUE;
-
- /* We remove any zvol links prior to renaming them */
- ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, &dd);
- if (ret) {
- goto error;
- }
} else {
if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL)
return (-1);
@@ -3679,27 +3636,10 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
* On failure, we still want to remount any filesystems that
* were previously mounted, so we don't alter the system state.
*/
- if (recursive) {
- struct createdata cd;
-
- /* only create links for datasets that had existed */
- cd.cd_snapname = delim + 1;
- cd.cd_ifexists = B_TRUE;
- (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
- &cd);
- } else {
+ if (!recursive)
(void) changelist_postfix(cl);
- }
} else {
- if (recursive) {
- struct createdata cd;
-
- /* only create links for datasets that had existed */
- cd.cd_snapname = strchr(target, '@') + 1;
- cd.cd_ifexists = B_TRUE;
- ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
- &cd);
- } else {
+ if (!recursive) {
changelist_rename(cl, zfs_get_name(zhp), target);
ret = changelist_postfix(cl);
}
@@ -3718,147 +3658,19 @@ error:
return (ret);
}
-/*
- * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
- * poke devfsadm to create the /dev link, and then wait for the link to appear.
- */
-int
-zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
-{
- return (zvol_create_link_common(hdl, dataset, B_FALSE));
-}
-
-static int
-zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
-{
- zfs_cmd_t zc = { 0 };
-#if 0
- di_devlink_handle_t dhdl;
- priv_set_t *priv_effective;
- int privileged;
-#endif
-
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
- /*
- * Issue the appropriate ioctl.
- */
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
- switch (errno) {
- case EEXIST:
- /*
- * Silently ignore the case where the link already
- * exists. This allows 'zfs volinit' to be run multiple
- * times without errors.
- */
- return (0);
-
- case ENOENT:
- /*
- * Dataset does not exist in the kernel. If we
- * don't care (see zfs_rename), then ignore the
- * error quietly.
- */
- if (ifexists) {
- return (0);
- }
-
- /* FALLTHROUGH */
-
- default:
- return (zfs_standard_error_fmt(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot create device links "
- "for '%s'"), dataset));
- }
- }
-
-#if 0
- /*
- * If privileged call devfsadm and wait for the links to
- * magically appear.
- * Otherwise, print out an informational message.
- */
-
- priv_effective = priv_allocset();
- (void) getppriv(PRIV_EFFECTIVE, priv_effective);
- privileged = (priv_isfullset(priv_effective) == B_TRUE);
- priv_freeset(priv_effective);
-
- if (privileged) {
- if ((dhdl = di_devlink_init(ZFS_DRIVER,
- DI_MAKE_LINK)) == NULL) {
- zfs_error_aux(hdl, strerror(errno));
- (void) zfs_error_fmt(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot create device links "
- "for '%s'"), dataset);
- (void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
- return (-1);
- } else {
- (void) di_devlink_fini(&dhdl);
- }
- } else {
- char pathname[MAXPATHLEN];
- struct stat64 statbuf;
- int i;
-
-#define MAX_WAIT 10
-
- /*
- * This is the poor mans way of waiting for the link
- * to show up. If after 10 seconds we still don't
- * have it, then print out a message.
- */
- (void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s",
- dataset);
-
- for (i = 0; i != MAX_WAIT; i++) {
- if (stat64(pathname, &statbuf) == 0)
- break;
- (void) sleep(1);
- }
- if (i == MAX_WAIT)
- (void) printf(gettext("%s may not be immediately "
- "available\n"), pathname);
- }
-#endif
-
- return (0);
-}
-
-/*
- * Remove a minor node for the given zvol and the associated /dev links.
- */
-int
-zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
+nvlist_t *
+zfs_get_user_props(zfs_handle_t *zhp)
{
- zfs_cmd_t zc = { 0 };
-
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
- if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
- switch (errno) {
- case ENXIO:
- /*
- * Silently ignore the case where the link no longer
- * exists, so that 'zfs volfini' can be run multiple
- * times without errors.
- */
- return (0);
-
- default:
- return (zfs_standard_error_fmt(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot remove device "
- "links for '%s'"), dataset));
- }
- }
-
- return (0);
+ return (zhp->zfs_user_props);
}
nvlist_t *
-zfs_get_user_props(zfs_handle_t *zhp)
+zfs_get_recvd_props(zfs_handle_t *zhp)
{
- return (zhp->zfs_user_props);
+ if (zhp->zfs_recvd_props == NULL)
+ if (get_recvd_props_ioctl(zhp) != 0)
+ return (NULL);
+ return (zhp->zfs_recvd_props);
}
/*
@@ -3870,10 +3682,12 @@ zfs_get_user_props(zfs_handle_t *zhp)
* for new unique user properties and add them to the list.
*
* - For non fixed-width properties, keep track of the maximum width seen
- * so that we can size the column appropriately.
+ * so that we can size the column appropriately. If the user has
+ * requested received property values, we also need to compute the width
+ * of the RECEIVED column.
*/
int
-zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp)
+zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received)
{
libzfs_handle_t *hdl = zhp->zfs_hdl;
zprop_list_t *entry;
@@ -3944,66 +3758,30 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp)
if (strlen(buf) > entry->pl_width)
entry->pl_width = strlen(buf);
}
- } else if (nvlist_lookup_nvlist(userprops,
- entry->pl_user_prop, &propval) == 0) {
- verify(nvlist_lookup_string(propval,
- ZPROP_VALUE, &strval) == 0);
- if (strlen(strval) > entry->pl_width)
- entry->pl_width = strlen(strval);
+ if (received && zfs_prop_get_recvd(zhp,
+ zfs_prop_to_name(entry->pl_prop),
+ buf, sizeof (buf), B_FALSE) == 0)
+ if (strlen(buf) > entry->pl_recvd_width)
+ entry->pl_recvd_width = strlen(buf);
+ } else {
+ if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop,
+ &propval) == 0) {
+ verify(nvlist_lookup_string(propval,
+ ZPROP_VALUE, &strval) == 0);
+ if (strlen(strval) > entry->pl_width)
+ entry->pl_width = strlen(strval);
+ }
+ if (received && zfs_prop_get_recvd(zhp,
+ entry->pl_user_prop,
+ buf, sizeof (buf), B_FALSE) == 0)
+ if (strlen(buf) > entry->pl_recvd_width)
+ entry->pl_recvd_width = strlen(buf);
}
}
return (0);
}
-#ifdef TODO
-int
-zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred)
-{
- zfs_cmd_t zc = { 0 };
- nvlist_t *nvp;
- gid_t gid;
- uid_t uid;
- const gid_t *groups;
- int group_cnt;
- int error;
-
- if (nvlist_alloc(&nvp, NV_UNIQUE_NAME, 0) != 0)
- return (no_memory(hdl));
-
- uid = ucred_geteuid(cred);
- gid = ucred_getegid(cred);
- group_cnt = ucred_getgroups(cred, &groups);
-
- if (uid == (uid_t)-1 || gid == (uid_t)-1 || group_cnt == (uid_t)-1)
- return (1);
-
- if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_UID, uid) != 0) {
- nvlist_free(nvp);
- return (1);
- }
-
- if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_GID, gid) != 0) {
- nvlist_free(nvp);
- return (1);
- }
-
- if (nvlist_add_uint32_array(nvp,
- ZFS_DELEG_PERM_GROUPS, (uint32_t *)groups, group_cnt) != 0) {
- nvlist_free(nvp);
- return (1);
- }
- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
- if (zcmd_write_src_nvlist(hdl, &zc, nvp))
- return (-1);
-
- error = ioctl(hdl->libzfs_fd, ZFS_IOC_ISCSI_PERM_CHECK, &zc);
- nvlist_free(nvp);
- return (error);
-}
-#endif
-
int
zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
char *resource, void *export, void *sharetab,
@@ -4042,9 +3820,11 @@ zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr);
/*
- * We leave user:props in the nvlist, so there will be
- * some ZPROP_INVAL. To be extra safe, don't prune
- * those.
+ * User properties will result in ZPROP_INVAL, and since we
+ * only know how to prune standard ZFS properties, we always
+ * leave these in the list. This can also happen if we
+ * encounter an unknown DSL property (when running older
+ * software, for example).
*/
if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE)
(void) nvlist_remove(zhp->zfs_props,
@@ -4173,6 +3953,331 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
return (error);
}
+int
+zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
+ boolean_t recursive, boolean_t temphold, boolean_t enoent_ok,
+ int cleanup_fd, uint64_t dsobj, uint64_t createtxg)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+ ASSERT(!recursive || dsobj == 0);
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
+ >= sizeof (zc.zc_string))
+ return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
+ zc.zc_cookie = recursive;
+ zc.zc_temphold = temphold;
+ zc.zc_cleanup_fd = cleanup_fd;
+ zc.zc_sendobj = dsobj;
+ zc.zc_createtxg = createtxg;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ /*
+ * if it was recursive, the one that actually failed will be in
+ * zc.zc_name.
+ */
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot hold '%s@%s'"), zc.zc_name, snapname);
+ switch (errno) {
+ case E2BIG:
+ /*
+ * Temporary tags wind up having the ds object id
+ * prepended. So even if we passed the length check
+ * above, it's still possible for the tag to wind
+ * up being slightly too long.
+ */
+ return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf));
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+ case EINVAL:
+ return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+ case EEXIST:
+ return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+ case ENOENT:
+ if (enoent_ok)
+ return (ENOENT);
+ /* FALLTHROUGH */
+ default:
+ return (zfs_standard_error_fmt(hdl, errno, errbuf));
+ }
+ }
+
+ return (0);
+}
+
+int
+zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
+ boolean_t recursive)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
+ >= sizeof (zc.zc_string))
+ return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
+ zc.zc_cookie = recursive;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ /*
+ * if it was recursive, the one that actually failed will be in
+ * zc.zc_name.
+ */
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot release '%s' from '%s@%s'"), tag, zc.zc_name,
+ snapname);
+ switch (errno) {
+ case ESRCH:
+ return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+ case EINVAL:
+ return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+ default:
+ return (zfs_standard_error_fmt(hdl, errno, errbuf));
+ }
+ }
+
+ return (0);
+}
+
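+/*
+ * Example usage (a sketch, not from this change): pairing a hold with
+ * its release on the same snapshot tag.
+ *
+ *	(void) zfs_hold(zhp, "snap", "mytag", B_FALSE, B_FALSE, B_FALSE,
+ *	    -1, 0, 0);
+ *	(void) zfs_release(zhp, "snap", "mytag", B_FALSE);
+ */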
+int
+zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+ int nvsz = 2048;
+ void *nvbuf;
+ int err = 0;
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+ zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
+
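+	/*
+	 * Start with a 2K buffer; if the kernel reports a larger required
+	 * size via ENOMEM and zc_nvlist_dst_size, grow the buffer and
+	 * retry until the packed nvlist fits.
+	 */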
+tryagain:
+
+ nvbuf = malloc(nvsz);
+ if (nvbuf == NULL) {
+ err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+ goto out;
+ }
+
+ zc.zc_nvlist_dst_size = nvsz;
+ zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
+
+ if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) {
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
+ zc.zc_name);
+ switch (errno) {
+ case ENOMEM:
+ free(nvbuf);
+ nvsz = zc.zc_nvlist_dst_size;
+ goto tryagain;
+
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+ break;
+ case EINVAL:
+ err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+ break;
+ case ENOENT:
+ err = zfs_error(hdl, EZFS_NOENT, errbuf);
+ break;
+ default:
+ err = zfs_standard_error_fmt(hdl, errno, errbuf);
+ break;
+ }
+ } else {
+ /* success */
+ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
+ if (rc) {
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(
+ TEXT_DOMAIN, "cannot get permissions on '%s'"),
+ zc.zc_name);
+ err = zfs_standard_error_fmt(hdl, rc, errbuf);
+ }
+ }
+
+ free(nvbuf);
+out:
+ return (err);
+}
+
+int
+zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+ char *nvbuf;
+ char errbuf[ZFS_MAXNAMELEN+32];
+ size_t nvsz;
+ int err;
+
+ assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+ zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
+
+ err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
+ assert(err == 0);
+
+ nvbuf = malloc(nvsz);
+
+ err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
+ assert(err == 0);
+
+ zc.zc_nvlist_src_size = nvsz;
+ zc.zc_nvlist_src = (uintptr_t)nvbuf;
+ zc.zc_perm_action = un;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+ if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
+ zc.zc_name);
+ switch (errno) {
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+ break;
+ case EINVAL:
+ err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+ break;
+ case ENOENT:
+ err = zfs_error(hdl, EZFS_NOENT, errbuf);
+ break;
+ default:
+ err = zfs_standard_error_fmt(hdl, errno, errbuf);
+ break;
+ }
+ }
+
+ free(nvbuf);
+
+ return (err);
+}
+
+int
+zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+ int nvsz = 2048;
+ void *nvbuf;
+ int err = 0;
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+tryagain:
+
+ nvbuf = malloc(nvsz);
+ if (nvbuf == NULL) {
+ err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+ goto out;
+ }
+
+ zc.zc_nvlist_dst_size = nvsz;
+ zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
+
+ if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) {
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
+ zc.zc_name);
+ switch (errno) {
+ case ENOMEM:
+ free(nvbuf);
+ nvsz = zc.zc_nvlist_dst_size;
+ goto tryagain;
+
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+ break;
+ case EINVAL:
+ err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+ break;
+ case ENOENT:
+ err = zfs_error(hdl, EZFS_NOENT, errbuf);
+ break;
+ default:
+ err = zfs_standard_error_fmt(hdl, errno, errbuf);
+ break;
+ }
+ } else {
+ /* success */
+ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
+ if (rc) {
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
+ zc.zc_name);
+ err = zfs_standard_error_fmt(hdl, rc, errbuf);
+ }
+ }
+
+ free(nvbuf);
+out:
+ return (err);
+}
+
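+/*
+ * Estimate the reservation needed for a zvol: the volume data itself
+ * (multiplied by the copies property) plus the metadata overhead of
+ * the block tree.  Worked example (a sketch, assuming DNODES_PER_LEVEL
+ * is 128 and ZVOL_DEFAULT_BLOCKSIZE is 8K): a 1G zvol has 131072 data
+ * blocks, giving 1024 L1 + 8 L2 + 1 L3 indirect blocks; adding the 7
+ * metadnode levels makes 1040 metadata blocks, which doubled for
+ * copies=1 and multiplied by the 16K maximum indirect block size comes
+ * to roughly 34M of overhead on top of the 1G of data.
+ */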
+uint64_t
+zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
+{
+ uint64_t numdb;
+ uint64_t nblocks, volblocksize;
+ int ncopies;
+ char *strval;
+
+ if (nvlist_lookup_string(props,
+ zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0)
+ ncopies = atoi(strval);
+ else
+ ncopies = 1;
+ if (nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize) != 0)
+ volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+ nblocks = volsize/volblocksize;
+ /* start with metadnode L0-L6 */
+ numdb = 7;
+ /* calculate number of indirects */
+ while (nblocks > 1) {
+ nblocks += DNODES_PER_LEVEL - 1;
+ nblocks /= DNODES_PER_LEVEL;
+ numdb += nblocks;
+ }
+ numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
+ volsize *= ncopies;
+ /*
+	 * each indirect block is exactly 1 << DN_MAX_INDBLKSHIFT bytes
+	 * when metadata isn't compressed, but in practice indirect
+	 * blocks compress down to about 1100 bytes
+ */
+ numdb *= 1ULL << DN_MAX_INDBLKSHIFT;
+ volsize += numdb;
+ return (volsize);
+}
+
/*
* Attach/detach the given filesystem to/from the given jail.
*/
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c
new file mode 100644
index 000000000000..ae84285a9bb5
--- /dev/null
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c
@@ -0,0 +1,832 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * zfs diff support
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <libintl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/zfs_ioctl.h>
+#include <libzfs.h>
+#include "libzfs_impl.h"
+
+#define ZDIFF_SNAPDIR "/.zfs/snapshot/"
+#define ZDIFF_SHARESDIR "/.zfs/shares/"
+#define ZDIFF_PREFIX "zfs-diff-%d"
+
+#define ZDIFF_ADDED '+'
+#define ZDIFF_MODIFIED 'M'
+#define ZDIFF_REMOVED '-'
+#define ZDIFF_RENAMED 'R'
+
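+/*
+ * Compare the basenames of two paths; both paths are assumed to
+ * contain a '/'.
+ */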
+static boolean_t
+do_name_cmp(const char *fpath, const char *tpath)
+{
+ char *fname, *tname;
+ fname = strrchr(fpath, '/') + 1;
+ tname = strrchr(tpath, '/') + 1;
+ return (strcmp(fname, tname) == 0);
+}
+
+typedef struct differ_info {
+ zfs_handle_t *zhp;
+ char *fromsnap;
+ char *frommnt;
+ char *tosnap;
+ char *tomnt;
+ char *ds;
+ char *dsmnt;
+ char *tmpsnap;
+ char errbuf[1024];
+ boolean_t isclone;
+ boolean_t scripted;
+ boolean_t classify;
+ boolean_t timestamped;
+ uint64_t shares;
+ int zerr;
+ int cleanupfd;
+ int outputfd;
+ int datafd;
+} differ_info_t;
+
+/*
+ * Given a {dsname, object id}, get the object path
+ */
+static int
+get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
+ char *pn, int maxlen, zfs_stat_t *sb)
+{
+ zfs_cmd_t zc = { 0 };
+ int error;
+
+ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
+ zc.zc_obj = obj;
+
+ errno = 0;
+ error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc);
+ di->zerr = errno;
+
+ /* we can get stats even if we failed to get a path */
+ (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t));
+ if (error == 0) {
+ ASSERT(di->zerr == 0);
+ (void) strlcpy(pn, zc.zc_value, maxlen);
+ return (0);
+ }
+
+ if (di->zerr == EPERM) {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "The sys_config privilege or diff delegated permission "
+ "is needed\nto discover path names"));
+ return (-1);
+ } else {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Unable to determine path or stats for "
+		    "object %lld in %s"), (longlong_t)obj, dsname);
+ return (-1);
+ }
+}
+
+/*
+ * stream_bytes
+ *
+ * Prints a file name out a character at a time. If the character is
+ * not in the range of what we consider "printable" ASCII, display it
+ * as an escaped 3-digit octal value.  ASCII values below a space are
+ * all control characters, and we treat the DELete character (0177,
+ * the last 7-bit ASCII value) as the upper bound.  Anything with the
+ * eighth bit set is likewise treated as not printable for this
+ * application.
+ */
+static void
+stream_bytes(FILE *fp, const char *string)
+{
+ while (*string) {
+ if (*string > ' ' && *string != '\\' && *string < '\177')
+ (void) fprintf(fp, "%c", *string++);
+ else
+ (void) fprintf(fp, "\\%03o", *string++);
+ }
+}
+
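+/*
+ * Emit a single classification character for the file type, in the
+ * spirit of ls -F ('/' for directories, '@' for symlinks, '=' for
+ * sockets, and so on).
+ */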
+static void
+print_what(FILE *fp, mode_t what)
+{
+ char symbol;
+
+ switch (what & S_IFMT) {
+ case S_IFBLK:
+ symbol = 'B';
+ break;
+ case S_IFCHR:
+ symbol = 'C';
+ break;
+ case S_IFDIR:
+ symbol = '/';
+ break;
+#ifdef S_IFDOOR
+ case S_IFDOOR:
+ symbol = '>';
+ break;
+#endif
+ case S_IFIFO:
+ symbol = '|';
+ break;
+ case S_IFLNK:
+ symbol = '@';
+ break;
+#ifdef S_IFPORT
+ case S_IFPORT:
+ symbol = 'P';
+ break;
+#endif
+ case S_IFSOCK:
+ symbol = '=';
+ break;
+ case S_IFREG:
+ symbol = 'F';
+ break;
+ default:
+ symbol = '?';
+ break;
+ }
+ (void) fprintf(fp, "%c", symbol);
+}
+
+static void
+print_cmn(FILE *fp, differ_info_t *di, const char *file)
+{
+ stream_bytes(fp, di->dsmnt);
+ stream_bytes(fp, file);
+}
+
+static void
+print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new,
+ zfs_stat_t *isb)
+{
+ if (di->timestamped)
+ (void) fprintf(fp, "%10lld.%09lld\t",
+ (longlong_t)isb->zs_ctime[0],
+ (longlong_t)isb->zs_ctime[1]);
+ (void) fprintf(fp, "%c\t", ZDIFF_RENAMED);
+ if (di->classify) {
+ print_what(fp, isb->zs_mode);
+ (void) fprintf(fp, "\t");
+ }
+ print_cmn(fp, di, old);
+ if (di->scripted)
+ (void) fprintf(fp, "\t");
+ else
+ (void) fprintf(fp, " -> ");
+ print_cmn(fp, di, new);
+ (void) fprintf(fp, "\n");
+}
+
+static void
+print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file,
+ zfs_stat_t *isb)
+{
+ if (di->timestamped)
+ (void) fprintf(fp, "%10lld.%09lld\t",
+ (longlong_t)isb->zs_ctime[0],
+ (longlong_t)isb->zs_ctime[1]);
+ (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED);
+ if (di->classify) {
+ print_what(fp, isb->zs_mode);
+ (void) fprintf(fp, "\t");
+ }
+ print_cmn(fp, di, file);
+ (void) fprintf(fp, "\t(%+d)", delta);
+ (void) fprintf(fp, "\n");
+}
+
+static void
+print_file(FILE *fp, differ_info_t *di, char type, const char *file,
+ zfs_stat_t *isb)
+{
+ if (di->timestamped)
+ (void) fprintf(fp, "%10lld.%09lld\t",
+ (longlong_t)isb->zs_ctime[0],
+ (longlong_t)isb->zs_ctime[1]);
+ (void) fprintf(fp, "%c\t", type);
+ if (di->classify) {
+ print_what(fp, isb->zs_mode);
+ (void) fprintf(fp, "\t");
+ }
+ print_cmn(fp, di, file);
+ (void) fprintf(fp, "\n");
+}
+
+static int
+write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj)
+{
+ struct zfs_stat fsb, tsb;
+ boolean_t same_name;
+ mode_t fmode, tmode;
+ char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN];
+ int fobjerr, tobjerr;
+ int change;
+
+ if (dobj == di->shares)
+ return (0);
+
+ /*
+ * Check the from and to snapshots for info on the object. If
+ * we get ENOENT, then the object just didn't exist in that
+ * snapshot. If we get ENOTSUP, then we tried to get
+ * info on a non-ZPL object, which we don't care about anyway.
+ */
+ fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname,
+ MAXPATHLEN, &fsb);
+ if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+ return (-1);
+
+ tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname,
+ MAXPATHLEN, &tsb);
+ if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+ return (-1);
+
+ /*
+ * Unallocated object sharing the same meta dnode block
+ */
+ if (fobjerr && tobjerr) {
+ ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP);
+ di->zerr = 0;
+ return (0);
+ }
+
+	di->zerr = 0; /* clear the error recorded by the side that failed */
+ fmode = fsb.zs_mode & S_IFMT;
+ tmode = tsb.zs_mode & S_IFMT;
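+	/*
+	 * A hard-link delta is not meaningful for directories or for
+	 * objects that no longer have any links.
+	 */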
+ if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 ||
+ tsb.zs_links == 0)
+ change = 0;
+ else
+ change = tsb.zs_links - fsb.zs_links;
+
+ if (fobjerr) {
+ if (change) {
+ print_link_change(fp, di, change, tobjname, &tsb);
+ return (0);
+ }
+ print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+ return (0);
+ } else if (tobjerr) {
+ if (change) {
+ print_link_change(fp, di, change, fobjname, &fsb);
+ return (0);
+ }
+ print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+ return (0);
+ }
+
+ if (fmode != tmode && fsb.zs_gen == tsb.zs_gen)
+ tsb.zs_gen++; /* Force a generational difference */
+ same_name = do_name_cmp(fobjname, tobjname);
+
+ /* Simple modification or no change */
+ if (fsb.zs_gen == tsb.zs_gen) {
+ /* No apparent changes. Could we assert !this? */
+ if (fsb.zs_ctime[0] == tsb.zs_ctime[0] &&
+ fsb.zs_ctime[1] == tsb.zs_ctime[1])
+ return (0);
+ if (change) {
+ print_link_change(fp, di, change,
+ change > 0 ? fobjname : tobjname, &tsb);
+ } else if (same_name) {
+ print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb);
+ } else {
+ print_rename(fp, di, fobjname, tobjname, &tsb);
+ }
+ return (0);
+ } else {
+ /* file re-created or object re-used */
+ print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+ print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+ return (0);
+ }
+}
+
+static int
+write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+ uint64_t o;
+ int err;
+
+ for (o = dr->ddr_first; o <= dr->ddr_last; o++) {
+		if ((err = write_inuse_diffs_one(fp, di, o)) != 0)
+ return (err);
+ }
+ return (0);
+}
+
+static int
+describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf,
+ int maxlen)
+{
+ struct zfs_stat sb;
+
+ if (get_stats_for_obj(di, di->fromsnap, object, namebuf,
+ maxlen, &sb) != 0) {
+ /* Let it slide, if in the delete queue on from side */
+ if (di->zerr == ENOENT && sb.zs_links == 0) {
+ di->zerr = 0;
+ return (0);
+ }
+ return (-1);
+ }
+
+ print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb);
+ return (0);
+}
+
+static int
+write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *lhdl = di->zhp->zfs_hdl;
+ char fobjname[MAXPATHLEN];
+
+ (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name));
+ zc.zc_obj = dr->ddr_first - 1;
+
+ ASSERT(di->zerr == 0);
+
+ while (zc.zc_obj < dr->ddr_last) {
+ int err;
+
+ err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc);
+ if (err == 0) {
+ if (zc.zc_obj == di->shares) {
+ zc.zc_obj++;
+ continue;
+ }
+ if (zc.zc_obj > dr->ddr_last) {
+ break;
+ }
+ err = describe_free(fp, di, zc.zc_obj, fobjname,
+ MAXPATHLEN);
+ if (err)
+ break;
+ } else if (errno == ESRCH) {
+ break;
+ } else {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+			    "failed to find next allocated object "
+			    "(> %lld)"), (longlong_t)zc.zc_obj);
+ di->zerr = errno;
+ break;
+ }
+ }
+ if (di->zerr)
+ return (-1);
+ return (0);
+}
+
+static void *
+differ(void *arg)
+{
+ differ_info_t *di = arg;
+ dmu_diff_record_t dr;
+ FILE *ofp;
+ int err = 0;
+
+ if ((ofp = fdopen(di->outputfd, "w")) == NULL) {
+ di->zerr = errno;
+ (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf));
+ (void) close(di->datafd);
+ return ((void *)-1);
+ }
+
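+	/*
+	 * The kernel writes a stream of fixed-size dmu_diff_record_t
+	 * records into the pipe; consume them until EOF, which is only
+	 * legitimate on a record boundary.
+	 */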
+ for (;;) {
+ char *cp = (char *)&dr;
+ int len = sizeof (dr);
+ int rv;
+
+ do {
+ rv = read(di->datafd, cp, len);
+ cp += rv;
+ len -= rv;
+ } while (len > 0 && rv > 0);
+
+ if (rv < 0 || (rv == 0 && len != sizeof (dr))) {
+ di->zerr = EPIPE;
+ break;
+ } else if (rv == 0) {
+ /* end of file at a natural breaking point */
+ break;
+ }
+
+ switch (dr.ddr_type) {
+ case DDR_FREE:
+ err = write_free_diffs(ofp, di, &dr);
+ break;
+ case DDR_INUSE:
+ err = write_inuse_diffs(ofp, di, &dr);
+ break;
+ default:
+ di->zerr = EPIPE;
+ break;
+ }
+
+ if (err || di->zerr)
+ break;
+ }
+
+ (void) fclose(ofp);
+ (void) close(di->datafd);
+ if (err)
+ return ((void *)-1);
+ if (di->zerr) {
+ ASSERT(di->zerr == EINVAL);
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Internal error: bad data from diff IOCTL"));
+ return ((void *)-1);
+ }
+ return ((void *)0);
+}
+
+static int
+find_shares_object(differ_info_t *di)
+{
+ char fullpath[MAXPATHLEN];
+ struct stat64 sb = { 0 };
+
+ (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN);
+ (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN);
+
+ if (stat64(fullpath, &sb) != 0) {
+#ifdef sun
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath);
+ return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf));
+#else
+ return (0);
+#endif
+ }
+
+ di->shares = (uint64_t)sb.st_ino;
+ return (0);
+}
+
+static int
+make_temp_snapshot(differ_info_t *di)
+{
+ libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+ zfs_cmd_t zc = { 0 };
+
+ (void) snprintf(zc.zc_value, sizeof (zc.zc_value),
+ ZDIFF_PREFIX, getpid());
+ (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name));
+ zc.zc_cleanup_fd = di->cleanupfd;
+
+ if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) {
+ int err = errno;
+ if (err == EPERM) {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN, "The diff delegated "
+ "permission is needed in order\nto create a "
+ "just-in-time snapshot for diffing\n"));
+ return (zfs_error(hdl, EZFS_DIFF, di->errbuf));
+ } else {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN, "Cannot create just-in-time "
+ "snapshot of '%s'"), zc.zc_name);
+ return (zfs_standard_error(hdl, err, di->errbuf));
+ }
+ }
+
+ di->tmpsnap = zfs_strdup(hdl, zc.zc_value);
+ di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap);
+ return (0);
+}
+
+static void
+teardown_differ_info(differ_info_t *di)
+{
+ free(di->ds);
+ free(di->dsmnt);
+ free(di->fromsnap);
+ free(di->frommnt);
+ free(di->tosnap);
+ free(di->tmpsnap);
+ free(di->tomnt);
+ (void) close(di->cleanupfd);
+}
+
+static int
+get_snapshot_names(differ_info_t *di, const char *fromsnap,
+ const char *tosnap)
+{
+ libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+ char *atptrf = NULL;
+ char *atptrt = NULL;
+ int fdslen, fsnlen;
+ int tdslen, tsnlen;
+
+ /*
+ * Can accept
+ * dataset@snap1
+ * dataset@snap1 dataset@snap2
+ * dataset@snap1 @snap2
+ * dataset@snap1 dataset
+ * @snap1 dataset@snap2
+ */
+ if (tosnap == NULL) {
+ /* only a from snapshot given, must be valid */
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Badly formed snapshot name %s"), fromsnap);
+
+ if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT,
+ B_FALSE)) {
+ return (zfs_error(hdl, EZFS_INVALIDNAME,
+ di->errbuf));
+ }
+
+ atptrf = strchr(fromsnap, '@');
+ ASSERT(atptrf != NULL);
+ fdslen = atptrf - fromsnap;
+
+ di->fromsnap = zfs_strdup(hdl, fromsnap);
+ di->ds = zfs_strdup(hdl, fromsnap);
+ di->ds[fdslen] = '\0';
+
+ /* the to snap will be a just-in-time snap of the head */
+ return (make_temp_snapshot(di));
+ }
+
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Unable to determine which snapshots to compare"));
+
+ atptrf = strchr(fromsnap, '@');
+ atptrt = strchr(tosnap, '@');
+ fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap);
+ tdslen = atptrt ? atptrt - tosnap : strlen(tosnap);
+ fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */
+ tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */
+
+ if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) ||
+ (fsnlen == 0 && tsnlen == 0)) {
+ return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+ } else if ((fdslen > 0 && tdslen > 0) &&
+ ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) {
+ /*
+ * not the same dataset name, might be okay if
+ * tosnap is a clone of a fromsnap descendant.
+ */
+ char origin[ZFS_MAXNAMELEN];
+ zprop_source_t src;
+ zfs_handle_t *zhp;
+
+ di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1);
+ (void) strncpy(di->ds, tosnap, tdslen);
+ di->ds[tdslen] = '\0';
+
+ zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM);
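+		/* Walk the origin chain looking for fromsnap. */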
+ while (zhp != NULL) {
+ (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+ origin, sizeof (origin), &src, NULL, 0, B_FALSE);
+
+ if (strncmp(origin, fromsnap, fsnlen) == 0)
+ break;
+
+ (void) zfs_close(zhp);
+ zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM);
+ }
+
+ if (zhp == NULL) {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Not an earlier snapshot from the same fs"));
+ return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+ } else {
+ (void) zfs_close(zhp);
+ }
+
+ di->isclone = B_TRUE;
+ di->fromsnap = zfs_strdup(hdl, fromsnap);
+ if (tsnlen) {
+ di->tosnap = zfs_strdup(hdl, tosnap);
+ } else {
+ return (make_temp_snapshot(di));
+ }
+ } else {
+ int dslen = fdslen ? fdslen : tdslen;
+
+ di->ds = zfs_alloc(hdl, dslen + 1);
+ (void) strncpy(di->ds, fdslen ? fromsnap : tosnap, dslen);
+ di->ds[dslen] = '\0';
+
+ di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf);
+ if (tsnlen) {
+ di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt);
+ } else {
+ return (make_temp_snapshot(di));
+ }
+ }
+ return (0);
+}
+
+static int
+get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt)
+{
+ boolean_t mounted;
+
+ mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt);
+ if (mounted == B_FALSE) {
+ (void) snprintf(di->errbuf, sizeof (di->errbuf),
+ dgettext(TEXT_DOMAIN,
+ "Cannot diff an unmounted snapshot"));
+ return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf));
+ }
+
+ /* Avoid a double slash at the beginning of root-mounted datasets */
+ if (**mntpt == '/' && *(*mntpt + 1) == '\0')
+ **mntpt = '\0';
+ return (0);
+}
+
+static int
+get_mountpoints(differ_info_t *di)
+{
+ char *strptr;
+ char *frommntpt;
+
+ /*
+ * first get the mountpoint for the parent dataset
+ */
+ if (get_mountpoint(di, di->ds, &di->dsmnt) != 0)
+ return (-1);
+
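+	/* Build the hidden .zfs/snapshot paths for the two snapshots. */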
+ strptr = strchr(di->tosnap, '@');
+ ASSERT3P(strptr, !=, NULL);
+ di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt,
+ ZDIFF_SNAPDIR, ++strptr);
+
+ strptr = strchr(di->fromsnap, '@');
+ ASSERT3P(strptr, !=, NULL);
+
+ frommntpt = di->dsmnt;
+ if (di->isclone) {
+ char *mntpt;
+ int err;
+
+ *strptr = '\0';
+ err = get_mountpoint(di, di->fromsnap, &mntpt);
+ *strptr = '@';
+ if (err != 0)
+ return (-1);
+ frommntpt = mntpt;
+ }
+
+ di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt,
+ ZDIFF_SNAPDIR, ++strptr);
+
+ if (di->isclone)
+ free(frommntpt);
+
+ return (0);
+}
+
+static int
+setup_differ_info(zfs_handle_t *zhp, const char *fromsnap,
+ const char *tosnap, differ_info_t *di)
+{
+ di->zhp = zhp;
+
+ di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL);
+ VERIFY(di->cleanupfd >= 0);
+
+ if (get_snapshot_names(di, fromsnap, tosnap) != 0)
+ return (-1);
+
+ if (get_mountpoints(di) != 0)
+ return (-1);
+
+ if (find_shares_object(di) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap,
+ const char *tosnap, int flags)
+{
+ zfs_cmd_t zc = { 0 };
+ char errbuf[1024];
+ differ_info_t di = { 0 };
+ pthread_t tid;
+ int pipefd[2];
+ int iocerr;
+
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "zfs diff failed"));
+
+ if (setup_differ_info(zhp, fromsnap, tosnap, &di)) {
+ teardown_differ_info(&di);
+ return (-1);
+ }
+
+ if (pipe(pipefd)) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+ teardown_differ_info(&di);
+ return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf));
+ }
+
+ di.scripted = (flags & ZFS_DIFF_PARSEABLE);
+ di.classify = (flags & ZFS_DIFF_CLASSIFY);
+ di.timestamped = (flags & ZFS_DIFF_TIMESTAMP);
+
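+	/*
+	 * The differ thread drains diff records from pipefd[0] while the
+	 * ZFS_IOC_DIFF ioctl below writes them into pipefd[1].
+	 */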
+ di.outputfd = outfd;
+ di.datafd = pipefd[0];
+
+ if (pthread_create(&tid, NULL, differ, &di)) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+ (void) close(pipefd[0]);
+ (void) close(pipefd[1]);
+ teardown_differ_info(&di);
+ return (zfs_error(zhp->zfs_hdl,
+ EZFS_THREADCREATEFAILED, errbuf));
+ }
+
+ /* do the ioctl() */
+ (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1);
+ (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1);
+ zc.zc_cookie = pipefd[1];
+
+ iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc);
+ if (iocerr != 0) {
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN, "Unable to obtain diffs"));
+ if (errno == EPERM) {
+ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+ "\n The sys_mount privilege or diff delegated "
+ "permission is needed\n to execute the "
+ "diff ioctl"));
+ } else if (errno == EXDEV) {
+ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+ "\n Not an earlier snapshot from the same fs"));
+ } else if (errno != EPIPE || di.zerr == 0) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+ }
+ (void) close(pipefd[1]);
+ (void) pthread_cancel(tid);
+ (void) pthread_join(tid, NULL);
+ teardown_differ_info(&di);
+ if (di.zerr != 0 && di.zerr != EPIPE) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+ return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+ } else {
+ return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf));
+ }
+ }
+
+ (void) close(pipefd[1]);
+ (void) pthread_join(tid, NULL);
+
+ if (di.zerr != 0) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+ return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+ }
+ teardown_differ_info(&di);
+ return (0);
+}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c
new file mode 100644
index 000000000000..788fa2cfb763
--- /dev/null
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c
@@ -0,0 +1,452 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <libintl.h>
+#include <link.h>
+#include <pthread.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <libzfs.h>
+
+#include <fm/libtopo.h>
+#include <sys/fm/protocol.h>
+#include <sys/systeminfo.h>
+
+#include "libzfs_impl.h"
+
+/*
+ * This file is responsible for determining the relationship between I/O
+ * devices paths and physical locations. In the world of MPxIO and external
+ * enclosures, the device path is not synonymous with the physical location.
+ * If you remove a drive and insert it into a different slot, it will end up
+ * with the same path under MPxIO. If you recable storage enclosures, the
+ * device paths may change. All of this makes it difficult to implement the
+ * 'autoreplace' property, which is supposed to automatically manage disk
+ * replacement based on physical slot.
+ *
+ * In order to work around these limitations, we have a per-vdev FRU property
+ * that is the libtopo path (minus disk-specific authority information) to the
+ * physical location of the device on the system. This is an optional
+ * property, and is only needed when using the 'autoreplace' property or when
+ * generating FMA faults against vdevs.
+ */
+
+/*
+ * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case
+ * it is not present. We only need this once per library instance, so it is
+ * not part of the libzfs handle.
+ */
+static void *_topo_dlhandle;
+static topo_hdl_t *(*_topo_open)(int, const char *, int *);
+static void (*_topo_close)(topo_hdl_t *);
+static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *);
+static void (*_topo_snap_release)(topo_hdl_t *);
+static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *,
+ topo_walk_cb_t, void *, int *);
+static int (*_topo_walk_step)(topo_walk_t *, int);
+static void (*_topo_walk_fini)(topo_walk_t *);
+static void (*_topo_hdl_strfree)(topo_hdl_t *, char *);
+static char *(*_topo_node_name)(tnode_t *);
+static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *,
+ char **, int *);
+static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *);
+static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *);
+static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *,
+ const char *);
+
+#define ZFS_FRU_HASH_SIZE 257
+
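+/*
+ * Classic ELF-style string hash, reduced modulo the hash table size.
+ */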
+static size_t
+fru_strhash(const char *key)
+{
+ ulong_t g, h = 0;
+ const char *p;
+
+ for (p = key; *p != '\0'; p++) {
+ h = (h << 4) + *p;
+
+ if ((g = (h & 0xf0000000)) != 0) {
+ h ^= (g >> 24);
+ h ^= g;
+ }
+ }
+
+ return (h % ZFS_FRU_HASH_SIZE);
+}
+
+static int
+libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg)
+{
+ libzfs_handle_t *hdl = arg;
+ nvlist_t *fru;
+ char *devpath, *frustr;
+ int err;
+ libzfs_fru_t *frup;
+ size_t idx;
+
+ /*
+ * If this is the chassis node, and we don't yet have the system
+ * chassis ID, then fill in this value now.
+ */
+ if (hdl->libzfs_chassis_id[0] == '\0' &&
+ strcmp(_topo_node_name(tn), "chassis") == 0) {
+ if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY,
+ FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0)
+ (void) strlcpy(hdl->libzfs_chassis_id, devpath,
+ sizeof (hdl->libzfs_chassis_id));
+ }
+
+ /*
+ * Skip non-disk nodes.
+ */
+ if (strcmp(_topo_node_name(tn), "disk") != 0)
+ return (TOPO_WALK_NEXT);
+
+ /*
+ * Get the devfs path and FRU.
+ */
+ if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0)
+ return (TOPO_WALK_NEXT);
+
+ if (libzfs_fru_lookup(hdl, devpath) != NULL) {
+ _topo_hdl_strfree(thp, devpath);
+ return (TOPO_WALK_NEXT);
+ }
+
+ if (_topo_node_fru(tn, &fru, NULL, &err) != 0) {
+ _topo_hdl_strfree(thp, devpath);
+ return (TOPO_WALK_NEXT);
+ }
+
+ /*
+ * Convert the FRU into a string.
+ */
+ if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) {
+ nvlist_free(fru);
+ _topo_hdl_strfree(thp, devpath);
+ return (TOPO_WALK_NEXT);
+ }
+
+ nvlist_free(fru);
+
+ /*
+ * Finally, we have a FRU string and device path. Add it to the hash.
+ */
+ if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) {
+ _topo_hdl_strfree(thp, devpath);
+ _topo_hdl_strfree(thp, frustr);
+ return (TOPO_WALK_NEXT);
+ }
+
+ if ((frup->zf_device = strdup(devpath)) == NULL ||
+ (frup->zf_fru = strdup(frustr)) == NULL) {
+ free(frup->zf_device);
+ free(frup);
+ _topo_hdl_strfree(thp, devpath);
+ _topo_hdl_strfree(thp, frustr);
+ return (TOPO_WALK_NEXT);
+ }
+
+ _topo_hdl_strfree(thp, devpath);
+ _topo_hdl_strfree(thp, frustr);
+
+ idx = fru_strhash(frup->zf_device);
+ frup->zf_chain = hdl->libzfs_fru_hash[idx];
+ hdl->libzfs_fru_hash[idx] = frup;
+ frup->zf_next = hdl->libzfs_fru_list;
+ hdl->libzfs_fru_list = frup;
+
+ return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Called during initialization to setup the dynamic libtopo connection.
+ */
+#pragma init(libzfs_init_fru)
+static void
+libzfs_init_fru(void)
+{
+ char path[MAXPATHLEN];
+ char isa[257];
+
+#if defined(_LP64)
+ if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0)
+ isa[0] = '\0';
+#else
+ isa[0] = '\0';
+#endif
+ (void) snprintf(path, sizeof (path),
+ "/usr/lib/fm/%s/libtopo.so", isa);
+
+ if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
+ return;
+
+ _topo_open = (topo_hdl_t *(*)())
+ dlsym(_topo_dlhandle, "topo_open");
+ _topo_close = (void (*)())
+ dlsym(_topo_dlhandle, "topo_close");
+ _topo_snap_hold = (char *(*)())
+ dlsym(_topo_dlhandle, "topo_snap_hold");
+ _topo_snap_release = (void (*)())
+ dlsym(_topo_dlhandle, "topo_snap_release");
+ _topo_walk_init = (topo_walk_t *(*)())
+ dlsym(_topo_dlhandle, "topo_walk_init");
+ _topo_walk_step = (int (*)())
+ dlsym(_topo_dlhandle, "topo_walk_step");
+ _topo_walk_fini = (void (*)())
+ dlsym(_topo_dlhandle, "topo_walk_fini");
+ _topo_hdl_strfree = (void (*)())
+ dlsym(_topo_dlhandle, "topo_hdl_strfree");
+ _topo_node_name = (char *(*)())
+ dlsym(_topo_dlhandle, "topo_node_name");
+ _topo_prop_get_string = (int (*)())
+ dlsym(_topo_dlhandle, "topo_prop_get_string");
+ _topo_node_fru = (int (*)())
+ dlsym(_topo_dlhandle, "topo_node_fru");
+ _topo_fmri_nvl2str = (int (*)())
+ dlsym(_topo_dlhandle, "topo_fmri_nvl2str");
+ _topo_fmri_strcmp_noauth = (int (*)())
+ dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth");
+
+ if (_topo_open == NULL || _topo_close == NULL ||
+ _topo_snap_hold == NULL || _topo_snap_release == NULL ||
+ _topo_walk_init == NULL || _topo_walk_step == NULL ||
+ _topo_walk_fini == NULL || _topo_hdl_strfree == NULL ||
+ _topo_node_name == NULL || _topo_prop_get_string == NULL ||
+ _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL ||
+ _topo_fmri_strcmp_noauth == NULL) {
+ (void) dlclose(_topo_dlhandle);
+ _topo_dlhandle = NULL;
+ }
+}
+
+/*
+ * Refresh the mappings from device path -> FMRI. We do this by walking the
+ * hc topology looking for disk nodes, and recording the io/devfs-path and FRU.
+ * Note that we strip out the disk-specific authority information (serial,
+ * part, revision, etc) so that we are left with only the identifying
+ * characteristics of the slot (hc path and chassis-id).
+ */
+void
+libzfs_fru_refresh(libzfs_handle_t *hdl)
+{
+ int err;
+ char *uuid;
+ topo_hdl_t *thp;
+ topo_walk_t *twp;
+
+ if (_topo_dlhandle == NULL)
+ return;
+
+ /*
+ * Clear the FRU hash and initialize our basic structures.
+ */
+ libzfs_fru_clear(hdl, B_FALSE);
+
+ if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION,
+ NULL, &err)) == NULL)
+ return;
+
+ thp = hdl->libzfs_topo_hdl;
+
+ if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL)
+ return;
+
+ _topo_hdl_strfree(thp, uuid);
+
+ if (hdl->libzfs_fru_hash == NULL &&
+ (hdl->libzfs_fru_hash =
+ calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL)
+ return;
+
+ /*
+ * We now have a topo snapshot, so iterate over the hc topology looking
+ * for disks to add to the hash.
+ */
+ twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC,
+ libzfs_fru_gather, hdl, &err);
+ if (twp != NULL) {
+ (void) _topo_walk_step(twp, TOPO_WALK_CHILD);
+ _topo_walk_fini(twp);
+ }
+}
+
+/*
+ * Given a devfs path, return the FRU for the device, if known. This will
+ * automatically call libzfs_fru_refresh() if it hasn't already been called by
+ * the consumer. The string returned is valid until the next call to
+ * libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath)
+{
+ size_t idx = fru_strhash(devpath);
+ libzfs_fru_t *frup;
+
+ if (hdl->libzfs_fru_hash == NULL)
+ libzfs_fru_refresh(hdl);
+
+ if (hdl->libzfs_fru_hash == NULL)
+ return (NULL);
+
+ for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+ frup = frup->zf_chain) {
+ if (strcmp(devpath, frup->zf_device) == 0)
+ return (frup->zf_fru);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Given a fru path, return the device path. This will automatically call
+ * libzfs_fru_refresh() if it hasn't already been called by the consumer. The
+ * string returned is valid until the next call to libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
+{
+ libzfs_fru_t *frup;
+ size_t idx;
+
+ if (hdl->libzfs_fru_hash == NULL)
+ libzfs_fru_refresh(hdl);
+
+ if (hdl->libzfs_fru_hash == NULL)
+ return (NULL);
+
+ for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) {
+ for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+ frup = frup->zf_next) {
+ if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl,
+ fru, frup->zf_fru))
+ return (frup->zf_device);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Change the stored FRU for the given vdev.
+ */
+int
+zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
+{
+ zfs_cmd_t zc = { 0 };
+
+ (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+ (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value));
+ zc.zc_guid = vdev_guid;
+
+ if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0)
+ return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
+ dgettext(TEXT_DOMAIN, "cannot set FRU")));
+
+ return (0);
+}
+
+/*
+ * Compare two FRUs, ignoring any authority information.
+ */
+boolean_t
+libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b)
+{
+ if (hdl->libzfs_fru_hash == NULL)
+ libzfs_fru_refresh(hdl);
+
+ if (hdl->libzfs_fru_hash == NULL)
+ return (strcmp(a, b) == 0);
+
+ return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b));
+}
+
+/*
+ * This function checks whether the FRU indicates it's supposed
+ * to be in the system chassis, but the chassis-id doesn't match. This can
+ * happen in a clustered case, where both head nodes have the same logical
+ * disk, but opening the device on the other head node is meaningless.
+ */
+boolean_t
+libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru)
+{
+ const char *chassisid;
+ size_t len;
+
+ if (hdl->libzfs_fru_hash == NULL)
+ libzfs_fru_refresh(hdl);
+
+ if (hdl->libzfs_chassis_id[0] == '\0')
+ return (B_FALSE);
+
+ if (strstr(fru, "/chassis=0/") == NULL)
+ return (B_FALSE);
+
+ if ((chassisid = strstr(fru, ":chassis-id=")) == NULL)
+ return (B_FALSE);
+
+ chassisid += 12;
+ len = strlen(hdl->libzfs_chassis_id);
+ if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 &&
+ (chassisid[len] == '/' || chassisid[len] == ':'))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Clear memory associated with the FRU hash.
+ */
+void
+libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final)
+{
+ libzfs_fru_t *frup;
+
+ while ((frup = hdl->libzfs_fru_list) != NULL) {
+ hdl->libzfs_fru_list = frup->zf_next;
+ free(frup->zf_device);
+ free(frup->zf_fru);
+ free(frup);
+ }
+
+ hdl->libzfs_fru_list = NULL;
+
+ if (hdl->libzfs_topo_hdl != NULL) {
+ _topo_snap_release(hdl->libzfs_topo_hdl);
+ _topo_close(hdl->libzfs_topo_hdl);
+ hdl->libzfs_topo_hdl = NULL;
+ }
+
+ if (final) {
+ free(hdl->libzfs_fru_hash);
+ } else if (hdl->libzfs_fru_hash != NULL) {
+ bzero(hdl->libzfs_fru_hash,
+ ZFS_FRU_HASH_SIZE * sizeof (void *));
+ }
+}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
index 06420332c023..9d1ecb72f411 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _LIBFS_IMPL_H
@@ -30,7 +29,6 @@
#include <sys/dmu.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
-#include <sys/zfs_acl.h>
#include <sys/spa.h>
#include <sys/nvpair.h>
@@ -38,6 +36,8 @@
#include <libuutil.h>
#include <libzfs.h>
+#include "zfs_ioctl_compat.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -47,6 +47,13 @@ extern "C" {
#endif
#define VERIFY verify
+typedef struct libzfs_fru {
+ char *zf_device;
+ char *zf_fru;
+ struct libzfs_fru *zf_chain;
+ struct libzfs_fru *zf_next;
+} libzfs_fru_t;
+
struct libzfs_handle {
int libzfs_error;
int libzfs_fd;
@@ -61,11 +68,17 @@ struct libzfs_handle {
char libzfs_desc[1024];
char *libzfs_log_str;
int libzfs_printerr;
+ int libzfs_storeerr; /* stuff error messages into buffer */
void *libzfs_sharehdl; /* libshare handle */
uint_t libzfs_shareflags;
boolean_t libzfs_mnttab_enable;
avl_tree_t libzfs_mnttab_cache;
+ int libzfs_pool_iter;
+ libzfs_fru_t **libzfs_fru_hash;
+ libzfs_fru_t *libzfs_fru_list;
+ char libzfs_chassis_id[256];
};
+
#define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */
struct zfs_handle {
@@ -77,6 +90,7 @@ struct zfs_handle {
dmu_objset_stats_t zfs_dmustats;
nvlist_t *zfs_props;
nvlist_t *zfs_user_props;
+ nvlist_t *zfs_recvd_props;
boolean_t zfs_mntcheck;
char *zfs_mntopts;
uint8_t *zfs_props_table;
@@ -112,7 +126,6 @@ typedef enum {
*/
typedef enum {
SHARED_NOT_SHARED = 0x0,
- SHARED_ISCSI = 0x1,
SHARED_NFS = 0x2,
SHARED_SMB = 0x4
} zfs_share_type_t;
@@ -122,6 +135,7 @@ int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...);
void zfs_error_aux(libzfs_handle_t *, const char *, ...);
void *zfs_alloc(libzfs_handle_t *, size_t);
void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t);
+char *zfs_asprintf(libzfs_handle_t *, const char *, ...);
char *zfs_strdup(libzfs_handle_t *, const char *);
int no_memory(libzfs_handle_t *);
@@ -172,11 +186,11 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);
int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
-int zvol_create_link(libzfs_handle_t *, const char *);
-int zvol_remove_link(libzfs_handle_t *, const char *);
-int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *);
boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
+int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
+ boolean_t modifying);
+
void namespace_clear(libzfs_handle_t *);
/*
@@ -190,7 +204,10 @@ extern int zfs_parse_options(char *, zfs_share_proto_t);
extern int zfs_unshare_proto(zfs_handle_t *,
const char *, zfs_share_proto_t *);
-#ifdef __FreeBSD__
+extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t);
+
+#ifndef sun
+static int zfs_kernel_version = 0;
/*
* This is FreeBSD version of ioctl, because Solaris' ioctl() updates
@@ -200,11 +217,23 @@ extern int zfs_unshare_proto(zfs_handle_t *,
static __inline int
zcmd_ioctl(int fd, unsigned long cmd, zfs_cmd_t *zc)
{
- size_t oldsize;
- int ret;
+ size_t oldsize, zfs_kernel_version_size;
+ int version, ret, cflag = ZFS_CMD_COMPAT_NONE;
+
+ zfs_kernel_version_size = sizeof(zfs_kernel_version);
+ if (zfs_kernel_version == 0) {
+ sysctlbyname("vfs.zfs.version.spa", &zfs_kernel_version,
+ &zfs_kernel_version_size, NULL, 0);
+ }
+
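+	/* Use the v15-compatible ioctl ABI when running on older kernels. */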
+ if (zfs_kernel_version == SPA_VERSION_15 ||
+ zfs_kernel_version == SPA_VERSION_14 ||
+ zfs_kernel_version == SPA_VERSION_13)
+ cflag = ZFS_CMD_COMPAT_V15;
oldsize = zc->zc_nvlist_dst_size;
- ret = ioctl(fd, cmd, zc);
+ ret = zcmd_ioctl_compat(fd, cmd, zc, cflag);
+
if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) {
ret = -1;
errno = ENOMEM;
@@ -213,7 +242,7 @@ zcmd_ioctl(int fd, unsigned long cmd, zfs_cmd_t *zc)
return (ret);
}
#define ioctl(fd, cmd, zc) zcmd_ioctl((fd), (cmd), (zc))
-#endif
+#endif /* !sun */
#ifdef __cplusplus
}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
index 166c8311ae45..4c31e56cf116 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Pool import support functions.
*
@@ -41,15 +38,18 @@
* using our derived config, and record the results.
*/
+#include <ctype.h>
#include <devid.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
+#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
+#include <thread_pool.h>
#include <libgeom.h>
#include <sys/vdev_impl.h>
@@ -113,6 +113,7 @@ get_devid(const char *path)
return (ret);
}
+
/*
* Go through and fix up any path and/or devid information for the given vdev
* configuration.
@@ -388,8 +389,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
}
if (err) {
- (void) zpool_standard_error(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot discover pools"));
zcmd_free_nvlists(&zc);
return (NULL);
}
@@ -404,6 +403,21 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
}
/*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+ for (int c = 0; c < holes; c++) {
+
+ /* Top-level is a hole */
+ if (hole_array[c] == id)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
* Convert our list of pools into the definitive set of configurations. We
* start by picking the best config for each toplevel vdev. Once that's done,
* we assemble the toplevel vdevs into a full config for the pool. We make a
@@ -425,17 +439,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
uint64_t version, guid;
uint_t children = 0;
nvlist_t **child = NULL;
+ uint_t holes;
+ uint64_t *hole_array, max_id;
uint_t c;
boolean_t isactive;
uint64_t hostid;
nvlist_t *nvl;
boolean_t found_one = B_FALSE;
+ boolean_t valid_top_config = B_FALSE;
if (nvlist_alloc(&ret, 0, 0) != 0)
goto nomem;
for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- uint64_t id;
+ uint64_t id, max_txg = 0;
if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
goto nomem;
@@ -463,6 +480,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
}
+ /*
+ * We rely on the fact that the max txg for the
+ * pool will contain the most up-to-date information
+ * about the valid top-levels in the vdev namespace.
+ */
+ if (best_txg > max_txg) {
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ DATA_TYPE_UINT64);
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ DATA_TYPE_UINT64_ARRAY);
+
+ max_txg = best_txg;
+ hole_array = NULL;
+ holes = 0;
+ max_id = 0;
+ valid_top_config = B_FALSE;
+
+ if (nvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+ verify(nvlist_add_uint64(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ max_id) == 0);
+ valid_top_config = B_TRUE;
+ }
+
+ if (nvlist_lookup_uint64_array(tmp,
+ ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+ &holes) == 0) {
+ verify(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ hole_array, holes) == 0);
+ }
+ }
+
if (!config_seen) {
/*
* Copy the relevant pieces of data to the pool
@@ -522,6 +575,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
&id) == 0);
+
if (id >= children) {
nvlist_t **newchild;
@@ -542,17 +596,82 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
+ /*
+ * If we have information about all the top-levels then
+ * clean up the nvlist which we've constructed. This
+ * means removing any extraneous devices that are
+ * beyond the valid range or adding devices to the end
+ * of our array which appear to be missing.
+ */
+ if (valid_top_config) {
+ if (max_id < children) {
+ for (c = max_id; c < children; c++)
+ nvlist_free(child[c]);
+ children = max_id;
+ } else if (max_id > children) {
+ nvlist_t **newchild;
+
+ newchild = zfs_alloc(hdl, (max_id) *
+ sizeof (nvlist_t *));
+ if (newchild == NULL)
+ goto nomem;
+
+ for (c = 0; c < children; c++)
+ newchild[c] = child[c];
+
+ free(child);
+ child = newchild;
+ children = max_id;
+ }
+ }
+
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&guid) == 0);
/*
+ * The vdev namespace may contain holes as a result of
+ * device removal. We must add them back into the vdev
+ * tree before we process any missing devices.
+ */
+ if (holes > 0) {
+ ASSERT(valid_top_config);
+
+ for (c = 0; c < children; c++) {
+ nvlist_t *holey;
+
+ if (child[c] != NULL ||
+ !vdev_is_hole(hole_array, holes, c))
+ continue;
+
+ if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+ 0) != 0)
+ goto nomem;
+
+ /*
+ * Holes in the namespace are treated as
+ * "hole" top-level vdevs and have a
+ * special flag set on them.
+ */
+ if (nvlist_add_string(holey,
+ ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_ID, c) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_GUID, 0ULL) != 0)
+ goto nomem;
+ child[c] = holey;
+ }
+ }
+
+ /*
* Look for any missing top-level vdevs. If this is the case,
* create a faked up 'missing' vdev as a placeholder. We cannot
* simply compress the child array, because the kernel performs
* certain checks to make sure the vdev IDs match their location
* in the configuration.
*/
- for (c = 0; c < children; c++)
+ for (c = 0; c < children; c++) {
if (child[c] == NULL) {
nvlist_t *missing;
if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
@@ -570,6 +689,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
child[c] = missing;
}
+ }
/*
* Put all of this pool's top-level vdevs into a root vdev.
@@ -636,8 +756,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
continue;
}
- if ((nvl = refresh_config(hdl, config)) == NULL)
- goto error;
+ if ((nvl = refresh_config(hdl, config)) == NULL) {
+ nvlist_free(config);
+ config = NULL;
+ continue;
+ }
nvlist_free(config);
config = nvl;
@@ -777,56 +900,216 @@ zpool_read_label(int fd, nvlist_t **config)
return (0);
}
+typedef struct rdsk_node {
+ char *rn_name;
+ int rn_dfd;
+ libzfs_handle_t *rn_hdl;
+ nvlist_t *rn_config;
+ avl_tree_t *rn_avl;
+ avl_node_t rn_node;
+ boolean_t rn_nozpool;
+} rdsk_node_t;
+
static int
-geom_find_import(libzfs_handle_t *hdl, pool_list_t *pools)
+slice_cache_compare(const void *arg1, const void *arg2)
{
- char path[MAXPATHLEN];
- struct gmesh mesh;
- struct gclass *mp;
- struct ggeom *gp;
- struct gprovider *pp;
+ const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+ const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+ char *nm1slice, *nm2slice;
+ int rv;
+
+ /*
+ * slices zero and two are the most likely to provide results,
+ * so put those first
+ */
+ nm1slice = strstr(nm1, "s0");
+ nm2slice = strstr(nm2, "s0");
+ if (nm1slice && !nm2slice) {
+ return (-1);
+ }
+ if (!nm1slice && nm2slice) {
+ return (1);
+ }
+ nm1slice = strstr(nm1, "s2");
+ nm2slice = strstr(nm2, "s2");
+ if (nm1slice && !nm2slice) {
+ return (-1);
+ }
+ if (!nm1slice && nm2slice) {
+ return (1);
+ }
+
+ rv = strcmp(nm1, nm2);
+ if (rv == 0)
+ return (0);
+ return (rv > 0 ? 1 : -1);
+}
+
+#ifdef sun
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+ diskaddr_t size, uint_t blksz)
+{
+ rdsk_node_t tmpnode;
+ rdsk_node_t *node;
+ char sname[MAXNAMELEN];
+
+ tmpnode.rn_name = &sname[0];
+ (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+ diskname, partno);
+ /*
+ * protect against division by zero for disk labels that
+ * contain a bogus sector size
+ */
+ if (blksz == 0)
+ blksz = DEV_BSIZE;
+ /* too small to contain a zpool? */
+ if ((size < (SPA_MINDEVSIZE / blksz)) &&
+ (node = avl_find(r, &tmpnode, NULL)))
+ node->rn_nozpool = B_TRUE;
+}
+#endif /* sun */
+
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+#ifdef sun
+ char diskname[MAXNAMELEN];
+ char *ptr;
+ int i;
+
+ (void) strncpy(diskname, sname, MAXNAMELEN);
+ if (((ptr = strrchr(diskname, 's')) == NULL) &&
+ ((ptr = strrchr(diskname, 'p')) == NULL))
+ return;
+ ptr[0] = 's';
+ ptr[1] = '\0';
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+ ptr[0] = 'p';
+ for (i = 0; i <= FD_NUMPART; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+#endif /* sun */
+}
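A worked example, assuming Solaris-style device names (the body is under #ifdef sun, so this is a no-op on FreeBSD):

	/*
	 * For sname "c1t0d0s3": diskname is truncated to "c1t0d0s" and
	 * check_one_slice() is invoked with size 0 (always below
	 * SPA_MINDEVSIZE) for slices 0..NDKMAP-1, then with the suffix
	 * flipped to 'p' for partitions 0..FD_NUMPART, marking every
	 * sibling node in the AVL cache rn_nozpool.
	 */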
+
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+#ifdef sun
+ struct extvtoc vtoc;
+ struct dk_gpt *gpt;
+ char diskname[MAXNAMELEN];
+ char *ptr;
+ int i;
+
+ (void) strncpy(diskname, sname, MAXNAMELEN);
+ if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
+ return;
+ ptr[1] = '\0';
+
+ if (read_extvtoc(fd, &vtoc) >= 0) {
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i,
+ vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+ } else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+ /*
+ * on x86 we'll still have leftover links that point
+ * to slices s[9-15], so use NDKMAP instead
+ */
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i,
+ gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+ /* nodes p[1-4] are never used with EFI labels */
+ ptr[0] = 'p';
+ for (i = 1; i <= FD_NUMPART; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+ efi_free(gpt);
+ }
+#endif /* sun */
+}
+
+static void
+zpool_open_func(void *arg)
+{
+ rdsk_node_t *rn = arg;
+ struct stat64 statbuf;
nvlist_t *config;
- int fd, ret = 0;
+ int fd;
+ if (rn->rn_nozpool)
+ return;
+ if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+ /* symlink to a device that's no longer there */
+ if (errno == ENOENT)
+ nozpool_all_slices(rn->rn_avl, rn->rn_name);
+ return;
+ }
/*
- * Go through and read the label configuration information from every
- * GEOM provider, organizing the information according to pool GUID
- * and toplevel GUID.
+ * Ignore failed stats. We only want regular
+ * files, character devs and block devs.
*/
+ if (fstat64(fd, &statbuf) != 0 ||
+ (!S_ISREG(statbuf.st_mode) &&
+ !S_ISCHR(statbuf.st_mode) &&
+ !S_ISBLK(statbuf.st_mode))) {
+ (void) close(fd);
+ return;
+ }
+ /* this file is too small to hold a zpool */
+ if (S_ISREG(statbuf.st_mode) &&
+ statbuf.st_size < SPA_MINDEVSIZE) {
+ (void) close(fd);
+ return;
+ } else if (!S_ISREG(statbuf.st_mode)) {
+ /*
+ * Try to read the disk label first so we don't have to
+ * open a bunch of minor nodes that can't have a zpool.
+ */
+ check_slices(rn->rn_avl, fd, rn->rn_name);
+ }
- fd = geom_gettree(&mesh);
- assert(fd == 0);
+ if ((zpool_read_label(fd, &config)) != 0) {
+ (void) close(fd);
+ (void) no_memory(rn->rn_hdl);
+ return;
+ }
+ (void) close(fd);
- LIST_FOREACH(mp, &mesh.lg_class, lg_class) {
- LIST_FOREACH(gp, &mp->lg_geom, lg_geom) {
- LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
- if ((fd = g_open(pp->lg_name, 0)) < 0)
- continue;
- (void) snprintf(path, sizeof (path), "%s%s",
- _PATH_DEV, pp->lg_name);
+ rn->rn_config = config;
+ if (config != NULL) {
+ assert(rn->rn_nozpool == B_FALSE);
+ }
+}
- if ((zpool_read_label(fd, &config)) != 0) {
- (void) g_close(fd);
- (void) no_memory(hdl);
- goto error;
- }
+/*
+ * Given a file descriptor, clear (zero) the label information. This function
+ * is currently only used in the appliance stack as part of the ZFS sysevent
+ * module.
+ */
+int
+zpool_clear_label(int fd)
+{
+ struct stat64 statbuf;
+ int l;
+ vdev_label_t *label;
+ uint64_t size;
- (void) g_close(fd);
+ if (fstat64(fd, &statbuf) == -1)
+ return (0);
+ size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
- if (config == NULL)
- continue;
+ if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
+ return (-1);
- if (add_config(hdl, pools, path, config) != 0) {
- ret = -1;
- goto error;
- }
- }
- }
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (pwrite64(fd, label, sizeof (vdev_label_t),
+ label_offset(size, l)) != sizeof (vdev_label_t)) {
+ free(label);
+ return (-1);
+ }
}
-error:
- geom_deletetree(&mesh);
- return (ret);
+
+ free(label);
+ return (0);
}
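A minimal caller sketch; the device path is hypothetical. The function zeroes all VDEV_LABELS label copies but leaves the data area untouched:

	int fd;

	if ((fd = open("/dev/da0", O_RDWR)) >= 0) {
		if (zpool_clear_label(fd) != 0)
			(void) fprintf(stderr, "cannot clear labels\n");
		(void) close(fd);
	}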
/*
@@ -837,30 +1120,28 @@ error:
* to import a specific pool.
*/
static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
- boolean_t active_ok, char *poolname, uint64_t guid)
+zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
{
- int i;
+ int i, dirs = iarg->paths;
DIR *dirp = NULL;
struct dirent64 *dp;
char path[MAXPATHLEN];
- char *end;
+ char *end, **dir = iarg->path;
size_t pathleft;
- struct stat64 statbuf;
- nvlist_t *ret = NULL, *config;
+ nvlist_t *ret = NULL;
static char *default_dir = "/dev/dsk";
- int fd;
pool_list_t pools = { 0 };
pool_entry_t *pe, *penext;
vdev_entry_t *ve, *venext;
config_entry_t *ce, *cenext;
name_entry_t *ne, *nenext;
+ avl_tree_t slice_cache;
+ rdsk_node_t *slice;
+ void *cookie;
- verify(poolname == NULL || guid == 0);
-
- if (argc == 0) {
- argc = 1;
- argv = &default_dir;
+ if (dirs == 0) {
+ dirs = 1;
+ dir = &default_dir;
}
/*
@@ -868,15 +1149,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
* possible device, organizing the information according to pool GUID
* and toplevel GUID.
*/
- for (i = 0; i < argc; i++) {
+ for (i = 0; i < dirs; i++) {
+ tpool_t *t;
char *rdsk;
int dfd;
/* use realpath to normalize the path */
- if (realpath(argv[i], path) == 0) {
+ if (realpath(dir[i], path) == 0) {
(void) zfs_error_fmt(hdl, EZFS_BADPATH,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"),
- argv[i]);
+ dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
goto error;
}
end = &path[strlen(path)];
@@ -884,22 +1165,18 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
*end = 0;
pathleft = &path[sizeof (path)] - end;
- if (strcmp(argv[i], default_dir) == 0) {
- geom_find_import(hdl, &pools);
- continue;
- }
-
/*
* Using raw devices instead of block devices when we're
* reading the labels skips a bunch of slow operations during
* close(2) processing, so we replace /dev/dsk with /dev/rdsk.
*/
if (strcmp(path, "/dev/dsk/") == 0)
- rdsk = "/dev/rdsk/";
+ rdsk = "/dev/";
else
rdsk = path;
- if ((dirp = opendir(rdsk)) == NULL) {
+ if ((dfd = open64(rdsk, O_RDONLY)) < 0 ||
+ (dirp = fdopendir(dfd)) == NULL) {
zfs_error_aux(hdl, strerror(errno));
(void) zfs_error_fmt(hdl, EZFS_BADPATH,
dgettext(TEXT_DOMAIN, "cannot open '%s'"),
@@ -907,6 +1184,41 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
goto error;
}
+ avl_create(&slice_cache, slice_cache_compare,
+ sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
+
+ if (strcmp(rdsk, "/dev/") == 0) {
+ struct gmesh mesh;
+ struct gclass *mp;
+ struct ggeom *gp;
+ struct gprovider *pp;
+
+ errno = geom_gettree(&mesh);
+ if (errno != 0) {
+ zfs_error_aux(hdl, strerror(errno));
+ (void) zfs_error_fmt(hdl, EZFS_BADPATH,
+ dgettext(TEXT_DOMAIN, "cannot get GEOM tree"));
+ goto error;
+ }
+
+ LIST_FOREACH(mp, &mesh.lg_class, lg_class) {
+ LIST_FOREACH(gp, &mp->lg_geom, lg_geom) {
+ LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
+ slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+ slice->rn_name = zfs_strdup(hdl, pp->lg_name);
+ slice->rn_avl = &slice_cache;
+ slice->rn_dfd = dfd;
+ slice->rn_hdl = hdl;
+ slice->rn_nozpool = B_FALSE;
+ avl_add(&slice_cache, slice);
+ }
+ }
+ }
+
+ geom_deletetree(&mesh);
+ goto skipdir;
+ }
+
/*
* This is not MT-safe, but we have no MT consumers of libzfs
*/
@@ -916,49 +1228,54 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
(name[1] == 0 || (name[1] == '.' && name[2] == 0)))
continue;
- (void) snprintf(path, sizeof (path), "%s/%s", rdsk,
- dp->d_name);
-
- if ((fd = open64(path, O_RDONLY)) < 0)
- continue;
-
- /*
- * Ignore failed stats. We only want regular
- * files, character devs and block devs.
- */
- if (fstat64(fd, &statbuf) != 0 ||
- (!S_ISREG(statbuf.st_mode) &&
- !S_ISCHR(statbuf.st_mode) &&
- !S_ISBLK(statbuf.st_mode))) {
- (void) close(fd);
- continue;
- }
-
- if ((zpool_read_label(fd, &config)) != 0) {
- (void) close(fd);
- (void) no_memory(hdl);
- goto error;
- }
-
- (void) close(fd);
-
- if (config != NULL) {
+ slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+ slice->rn_name = zfs_strdup(hdl, name);
+ slice->rn_avl = &slice_cache;
+ slice->rn_dfd = dfd;
+ slice->rn_hdl = hdl;
+ slice->rn_nozpool = B_FALSE;
+ avl_add(&slice_cache, slice);
+ }
+skipdir:
+ /*
+ * create a thread pool to do all of this in parallel;
+ * rn_nozpool is not protected, so this is racy in that
+ * multiple tasks could decide that the same slice cannot
+ * hold a zpool, which is benign. Also choose
+ * double the number of processors; we hold a lot of
+ * locks in the kernel, so going beyond this doesn't
+ * buy us much.
+ */
+ t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
+ 0, NULL);
+ for (slice = avl_first(&slice_cache); slice;
+ (slice = avl_walk(&slice_cache, slice,
+ AVL_AFTER)))
+ (void) tpool_dispatch(t, zpool_open_func, slice);
+ tpool_wait(t);
+ tpool_destroy(t);
+
+ cookie = NULL;
+ while ((slice = avl_destroy_nodes(&slice_cache,
+ &cookie)) != NULL) {
+ if (slice->rn_config != NULL) {
+ nvlist_t *config = slice->rn_config;
boolean_t matched = B_TRUE;
- if (poolname != NULL) {
+ if (iarg->poolname != NULL) {
char *pname;
matched = nvlist_lookup_string(config,
ZPOOL_CONFIG_POOL_NAME,
&pname) == 0 &&
- strcmp(poolname, pname) == 0;
- } else if (guid != 0) {
+ strcmp(iarg->poolname, pname) == 0;
+ } else if (iarg->guid != 0) {
uint64_t this_guid;
matched = nvlist_lookup_uint64(config,
ZPOOL_CONFIG_POOL_GUID,
&this_guid) == 0 &&
- guid == this_guid;
+ iarg->guid == this_guid;
}
if (!matched) {
nvlist_free(config);
@@ -966,17 +1283,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
continue;
}
/* use the non-raw path for the config */
- (void) strlcpy(end, name, pathleft);
+ (void) strlcpy(end, slice->rn_name, pathleft);
if (add_config(hdl, &pools, path, config) != 0)
goto error;
}
+ free(slice->rn_name);
+ free(slice);
}
+ avl_destroy(&slice_cache);
(void) closedir(dirp);
dirp = NULL;
}
- ret = get_configs(hdl, &pools, active_ok);
+ ret = get_configs(hdl, &pools, iarg->can_be_active);
error:
for (pe = pools.pools; pe != NULL; pe = penext) {
@@ -1010,27 +1330,12 @@ error:
nvlist_t *
zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
{
- return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, 0));
-}
+ importargs_t iarg = { 0 };
-nvlist_t *
-zpool_find_import_byname(libzfs_handle_t *hdl, int argc, char **argv,
- char *pool)
-{
- return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, pool, 0));
-}
+ iarg.paths = argc;
+ iarg.path = argv;
-nvlist_t *
-zpool_find_import_byguid(libzfs_handle_t *hdl, int argc, char **argv,
- uint64_t guid)
-{
- return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, guid));
-}
-
-nvlist_t *
-zpool_find_import_activeok(libzfs_handle_t *hdl, int argc, char **argv)
-{
- return (zpool_find_import_impl(hdl, argc, argv, B_TRUE, NULL, 0));
+ return (zpool_find_import_impl(hdl, &iarg));
}
/*
@@ -1152,6 +1457,46 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
return (pools);
}
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+ importargs_t *import = data;
+ int found = 0;
+
+ if (import->poolname != NULL) {
+ char *pool_name;
+
+ verify(nvlist_lookup_string(zhp->zpool_config,
+ ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
+ if (strcmp(pool_name, import->poolname) == 0)
+ found = 1;
+ } else {
+ uint64_t pool_guid;
+
+ verify(nvlist_lookup_uint64(zhp->zpool_config,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
+ if (pool_guid == import->guid)
+ found = 1;
+ }
+
+ zpool_close(zhp);
+ return (found);
+}
+
+nvlist_t *
+zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
+{
+ verify(import->poolname == NULL || import->guid == 0);
+
+ if (import->unique)
+ import->exists = zpool_iter(hdl, name_or_guid_exists, import);
+
+ if (import->cachefile != NULL)
+ return (zpool_find_import_cached(hdl, import->cachefile,
+ import->poolname, import->guid));
+
+ return (zpool_find_import_impl(hdl, import));
+}
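A caller sketch for the consolidated interface; the pool name is hypothetical. With unique set, exists reports whether a pool of that name is already imported before any on-disk search is attempted:

	importargs_t ia = { 0 };
	nvlist_t *pools;

	ia.poolname = "tank";
	ia.unique = B_TRUE;
	if ((pools = zpool_search_import(hdl, &ia)) != NULL && !ia.exists) {
		/* candidate configs found and the name is not in use */
	}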
boolean_t
find_guid(nvlist_t *nv, uint64_t guid)
@@ -1251,6 +1596,17 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
switch (stateval) {
case POOL_STATE_EXPORTED:
+ /*
+ * A pool with an exported state may in fact be imported
+ * read-only, so check the in-core state to see if it's
+ * active and imported read-only. If it is, set
+ * its state to active.
+ */
+ if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
+ (zhp = zpool_open_canfail(hdl, name)) != NULL &&
+ zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
+ stateval = POOL_STATE_ACTIVE;
+
ret = B_TRUE;
break;
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
index 56c0968ec2da..b2959dd1b841 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -44,17 +43,14 @@
*
* zfs_is_shared_nfs()
* zfs_is_shared_smb()
- * zfs_is_shared_iscsi()
* zfs_share_proto()
* zfs_shareall();
- * zfs_share_iscsi()
* zfs_unshare_nfs()
* zfs_unshare_smb()
* zfs_unshareall_nfs()
* zfs_unshareall_smb()
* zfs_unshareall()
* zfs_unshareall_bypath()
- * zfs_unshare_iscsi()
*
* The following functions are available for pool consumers, and will
* mount/unmount and share/unshare all datasets within pool:
@@ -82,18 +78,12 @@
#include "libzfs_impl.h"
#include <libshare.h>
-
#define MAXISALEN 257 /* based on sysinfo(2) man page */
static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
zfs_share_proto_t);
-static int (*iscsitgt_zfs_share)(const char *);
-static int (*iscsitgt_zfs_unshare)(const char *);
-static int (*iscsitgt_zfs_is_shared)(const char *);
-static int (*iscsitgt_svc_online)();
-
/*
* The share protocols table must be in the same order as the zfs_share_prot_t
* enum in libzfs_impl.h
@@ -125,29 +115,6 @@ zfs_share_proto_t share_all_proto[] = {
PROTO_END
};
-#pragma init(zfs_iscsi_init)
-static void
-zfs_iscsi_init(void)
-{
- void *libiscsitgt;
-
- if ((libiscsitgt = dlopen("/lib/libiscsitgt.so.1",
- RTLD_LAZY | RTLD_GLOBAL)) == NULL ||
- (iscsitgt_zfs_share = (int (*)(const char *))dlsym(libiscsitgt,
- "iscsitgt_zfs_share")) == NULL ||
- (iscsitgt_zfs_unshare = (int (*)(const char *))dlsym(libiscsitgt,
- "iscsitgt_zfs_unshare")) == NULL ||
- (iscsitgt_zfs_is_shared = (int (*)(const char *))dlsym(libiscsitgt,
- "iscsitgt_zfs_is_shared")) == NULL ||
- (iscsitgt_svc_online = (int (*)(const char *))dlsym(libiscsitgt,
- "iscsitgt_svc_online")) == NULL) {
- iscsitgt_zfs_share = NULL;
- iscsitgt_zfs_unshare = NULL;
- iscsitgt_zfs_is_shared = NULL;
- iscsitgt_svc_online = NULL;
- }
-}
-
/*
* Search the sharetab for the given mountpoint and protocol, returning
* a zfs_share_type_t value.
@@ -171,7 +138,7 @@ is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto)
*tab = '\0';
if (strcmp(buf, mountpoint) == 0) {
-#if defined(sun)
+#ifdef sun
/*
* the protocol field is the third field
* skip over second field
@@ -204,7 +171,7 @@ is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto)
return (SHARED_NOT_SHARED);
}
-#if 0
+#ifdef sun
/*
* Returns true if the specified directory is empty. If we can't open the
* directory at all, return true so that the mount can fail with a more
@@ -309,6 +276,12 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
else
(void) strlcpy(mntopts, options, sizeof (mntopts));
+ /*
+ * If the pool is imported read-only then all mounts must be read-only
+ */
+ if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
+ flags |= MS_RDONLY;
+
if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
return (0);
@@ -323,7 +296,7 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
}
}
-#if 0 /* FreeBSD: overlay mounts are not checked. */
+#ifdef sun /* FreeBSD: overlay mounts are not checked. */
/*
* Determine if the mountpoint is empty. If so, refuse to perform the
* mount. We don't perform this check if MS_OVERLAY is specified, which
@@ -354,6 +327,18 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
} else if (errno == EPERM) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Insufficient privileges"));
+ } else if (errno == ENOTSUP) {
+ char buf[256];
+ int spa_version;
+
+ VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
+ (void) snprintf(buf, sizeof (buf),
+ dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
+ "file system on a version %d pool. Pool must be"
+ " upgraded to mount this file system."),
+ (u_longlong_t)zfs_prop_get_int(zhp,
+ ZFS_PROP_VERSION), spa_version);
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf));
} else {
zfs_error_aux(hdl, strerror(errno));
}
@@ -374,7 +359,7 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
static int
unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
{
- if (unmount(mountpoint, flags) != 0) {
+ if (umount2(mountpoint, flags) != 0) {
zfs_error_aux(hdl, strerror(errno));
return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED,
dgettext(TEXT_DOMAIN, "cannot unmount '%s'"),
@@ -454,7 +439,7 @@ zfs_is_shared(zfs_handle_t *zhp)
zfs_share_proto_t *curr_proto;
if (ZFS_IS_VOLUME(zhp))
- return (zfs_is_shared_iscsi(zhp));
+ return (B_FALSE);
for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
curr_proto++)
@@ -466,18 +451,14 @@ zfs_is_shared(zfs_handle_t *zhp)
int
zfs_share(zfs_handle_t *zhp)
{
- if (ZFS_IS_VOLUME(zhp))
- return (zfs_share_iscsi(zhp));
-
+ assert(!ZFS_IS_VOLUME(zhp));
return (zfs_share_proto(zhp, share_all_proto));
}
int
zfs_unshare(zfs_handle_t *zhp)
{
- if (ZFS_IS_VOLUME(zhp))
- return (zfs_unshare_iscsi(zhp));
-
+ assert(!ZFS_IS_VOLUME(zhp));
return (zfs_unshareall(zhp));
}
@@ -525,7 +506,7 @@ zfs_is_shared_smb(zfs_handle_t *zhp, char **where)
* initialized in _zfs_init_libshare() are actually present.
*/
-#if 0
+#ifdef sun
static sa_handle_t (*_sa_init)(int);
static void (*_sa_fini)(sa_handle_t);
static sa_share_t (*_sa_find_share)(sa_handle_t, char *);
@@ -552,7 +533,7 @@ static void (*_sa_update_sharetab_ts)(sa_handle_t);
static void
_zfs_init_libshare(void)
{
-#if 0
+#ifdef sun
void *libshare;
char path[MAXPATHLEN];
char isa[MAXISALEN];
@@ -623,7 +604,7 @@ zfs_init_libshare(libzfs_handle_t *zhandle, int service)
{
int ret = SA_OK;
-#if 0
+#ifdef sun
if (_sa_init == NULL)
ret = SA_CONFIG_ERR;
@@ -664,7 +645,7 @@ void
zfs_uninit_libshare(libzfs_handle_t *zhandle)
{
if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) {
-#if 0
+#ifdef sun
if (_sa_fini != NULL)
_sa_fini(zhandle->libzfs_sharehdl);
#endif
@@ -681,7 +662,7 @@ zfs_uninit_libshare(libzfs_handle_t *zhandle)
int
zfs_parse_options(char *options, zfs_share_proto_t proto)
{
-#if 0
+#ifdef sun
if (_sa_parse_legacy_options != NULL) {
return (_sa_parse_legacy_options(NULL, options,
proto_table[proto].p_name));
@@ -692,7 +673,7 @@ zfs_parse_options(char *options, zfs_share_proto_t proto)
#endif
}
-#if 0
+#ifdef sun
/*
* zfs_sa_find_share(handle, path)
*
@@ -734,7 +715,7 @@ zfs_sa_disable_share(sa_share_t share, char *proto)
return (_sa_disable_share(share, proto));
return (SA_CONFIG_ERR);
}
-#endif
+#endif /* sun */
/*
* Share the given filesystem according to the options in the specified
@@ -755,6 +736,16 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
return (0);
+#ifdef sun
+ if ((ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) {
+ (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED,
+ dgettext(TEXT_DOMAIN, "cannot share '%s': %s"),
+ zfs_get_name(zhp), _sa_errorstr != NULL ?
+ _sa_errorstr(ret) : "");
+ return (-1);
+ }
+#endif
+
for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) {
/*
* Return success if there are no share options.
@@ -774,13 +765,7 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED))
continue;
- if (*curr_proto != PROTO_NFS) {
- fprintf(stderr, "Unsupported share protocol: %d.\n",
- *curr_proto);
- continue;
- }
-
-#if 0
+#ifdef sun
share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint);
if (share == NULL) {
/*
@@ -819,6 +804,12 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
}
} else
#else
+ if (*curr_proto != PROTO_NFS) {
+ fprintf(stderr, "Unsupported share protocol: %d.\n",
+ *curr_proto);
+ continue;
+ }
+
if (strcmp(shareopts, "on") == 0)
error = fsshare(ZFS_EXPORTS_PATH, mountpoint, "");
else
@@ -832,6 +823,7 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
zfs_get_name(zhp));
return (-1);
}
+
}
return (0);
}
@@ -862,23 +854,58 @@ static int
unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
zfs_share_proto_t proto)
{
+#ifdef sun
+ sa_share_t share;
+ int err;
+ char *mntpt;
+ /*
+ * Mountpoint could get trashed if libshare calls getmntany
+ * which it does during API initialization, so strdup the
+ * value.
+ */
+ mntpt = zfs_strdup(hdl, mountpoint);
+
+ /* make sure libshare initialized */
+ if ((err = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) {
+ free(mntpt); /* don't need the copy anymore */
+ return (zfs_error_fmt(hdl, EZFS_SHARENFSFAILED,
+ dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"),
+ name, _sa_errorstr(err)));
+ }
+
+ share = zfs_sa_find_share(hdl->libzfs_sharehdl, mntpt);
+ free(mntpt); /* don't need the copy anymore */
+
+ if (share != NULL) {
+ err = zfs_sa_disable_share(share, proto_table[proto].p_name);
+ if (err != SA_OK) {
+ return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED,
+ dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"),
+ name, _sa_errorstr(err)));
+ }
+ } else {
+ return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED,
+ dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"),
+ name));
+ }
+#else
char buf[MAXPATHLEN];
FILE *fp;
- int error;
+ int err;
if (proto != PROTO_NFS) {
fprintf(stderr, "No SMB support in FreeBSD yet.\n");
return (EOPNOTSUPP);
}
- error = fsunshare(ZFS_EXPORTS_PATH, mountpoint);
- if (error != 0) {
- zfs_error_aux(hdl, "%s", strerror(error));
+ err = fsunshare(ZFS_EXPORTS_PATH, mountpoint);
+ if (err != 0) {
+ zfs_error_aux(hdl, "%s", strerror(err));
return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED,
dgettext(TEXT_DOMAIN,
"cannot unshare '%s'"), name));
}
-
+#endif
return (0);
}
@@ -1011,99 +1038,29 @@ remove_mountpoint(zfs_handle_t *zhp)
}
}
-boolean_t
-zfs_is_shared_iscsi(zfs_handle_t *zhp)
-{
-
- /*
- * If iscsi deamon isn't running then we aren't shared
- */
- if (iscsitgt_svc_online && iscsitgt_svc_online() == 1)
- return (B_FALSE);
- else
- return (iscsitgt_zfs_is_shared != NULL &&
- iscsitgt_zfs_is_shared(zhp->zfs_name) != 0);
-}
-
-int
-zfs_share_iscsi(zfs_handle_t *zhp)
-{
- char shareopts[ZFS_MAXPROPLEN];
- const char *dataset = zhp->zfs_name;
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
- /*
- * Return success if there are no share options.
- */
- if (zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
- sizeof (shareopts), NULL, NULL, 0, B_FALSE) != 0 ||
- strcmp(shareopts, "off") == 0)
- return (0);
-
-/* We don't support iSCSI on FreeBSD yet. */
-#ifdef TODO
- if (iscsitgt_zfs_share == NULL || iscsitgt_zfs_share(dataset) != 0) {
- int error = EZFS_SHAREISCSIFAILED;
-
- /*
- * If service isn't availabele and EPERM was
- * returned then use special error.
- */
- if (iscsitgt_svc_online && errno == EPERM &&
- (iscsitgt_svc_online() != 0))
- error = EZFS_ISCSISVCUNAVAIL;
-
- return (zfs_error_fmt(hdl, error,
- dgettext(TEXT_DOMAIN, "cannot share '%s'"), dataset));
- }
-#endif
-
- return (0);
-}
-
-int
-zfs_unshare_iscsi(zfs_handle_t *zhp)
+void
+libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
{
- const char *dataset = zfs_get_name(zhp);
- libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-/* We don't support iSCSI on FreeBSD yet. */
-#ifdef TODO
- /*
- * Return if the volume is not shared
- */
- if (zfs_is_shared_iscsi(zhp) != SHARED_ISCSI)
- return (0);
+ if (cbp->cb_alloc == cbp->cb_used) {
+ size_t newsz;
+ void *ptr;
- /*
- * If this fails with ENODEV it indicates that zvol wasn't shared so
- * we should return success in that case.
- */
- if (iscsitgt_zfs_unshare == NULL ||
- (iscsitgt_zfs_unshare(dataset) != 0 && errno != ENODEV)) {
- if (errno == EPERM)
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "Insufficient privileges to unshare iscsi"));
- return (zfs_error_fmt(hdl, EZFS_UNSHAREISCSIFAILED,
- dgettext(TEXT_DOMAIN, "cannot unshare '%s'"), dataset));
+ newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
+ ptr = zfs_realloc(zhp->zfs_hdl,
+ cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
+ newsz * sizeof (void *));
+ cbp->cb_handles = ptr;
+ cbp->cb_alloc = newsz;
}
-#endif
-
- return (0);
+ cbp->cb_handles[cbp->cb_used++] = zhp;
}
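libzfs_add_handle() and libzfs_dataset_cmp() support a gather-sort-act pattern; a minimal sketch, assuming a callback that adds each handle the way mount_cb() below does:

	get_all_cb_t cb = { 0 };

	(void) zfs_iter_root(hdl, mount_cb, &cb);
	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
	    libzfs_dataset_cmp);
	/* operate on cb.cb_handles[0 .. cb.cb_used - 1] in mount order */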
-typedef struct mount_cbdata {
- zfs_handle_t **cb_datasets;
- int cb_used;
- int cb_alloc;
-} mount_cbdata_t;
-
static int
mount_cb(zfs_handle_t *zhp, void *data)
{
- mount_cbdata_t *cbp = data;
+ get_all_cb_t *cbp = data;
- if (!(zfs_get_type(zhp) & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) {
+ if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
zfs_close(zhp);
return (0);
}
@@ -1113,25 +1070,16 @@ mount_cb(zfs_handle_t *zhp, void *data)
return (0);
}
- if (cbp->cb_alloc == cbp->cb_used) {
- void *ptr;
-
- if ((ptr = zfs_realloc(zhp->zfs_hdl,
- cbp->cb_datasets, cbp->cb_alloc * sizeof (void *),
- cbp->cb_alloc * 2 * sizeof (void *))) == NULL)
- return (-1);
- cbp->cb_datasets = ptr;
-
- cbp->cb_alloc *= 2;
+ libzfs_add_handle(cbp, zhp);
+ if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+ zfs_close(zhp);
+ return (-1);
}
-
- cbp->cb_datasets[cbp->cb_used++] = zhp;
-
- return (zfs_iter_filesystems(zhp, mount_cb, cbp));
+ return (0);
}
-static int
-dataset_cmp(const void *a, const void *b)
+int
+libzfs_dataset_cmp(const void *a, const void *b)
{
zfs_handle_t **za = (zfs_handle_t **)a;
zfs_handle_t **zb = (zfs_handle_t **)b;
@@ -1169,7 +1117,7 @@ dataset_cmp(const void *a, const void *b)
int
zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
{
- mount_cbdata_t cb = { 0 };
+ get_all_cb_t cb = { 0 };
libzfs_handle_t *hdl = zhp->zpool_hdl;
zfs_handle_t *zfsp;
int i, ret = -1;
@@ -1178,23 +1126,17 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
/*
* Gather all non-snap datasets within the pool.
*/
- if ((cb.cb_datasets = zfs_alloc(hdl, 4 * sizeof (void *))) == NULL)
- return (-1);
- cb.cb_alloc = 4;
-
if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
goto out;
- cb.cb_datasets[0] = zfsp;
- cb.cb_used = 1;
-
+ libzfs_add_handle(&cb, zfsp);
if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
goto out;
-
/*
* Sort the datasets by mountpoint.
*/
- qsort(cb.cb_datasets, cb.cb_used, sizeof (void *), dataset_cmp);
+ qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
+ libzfs_dataset_cmp);
/*
* And mount all the datasets, keeping track of which ones
@@ -1206,7 +1148,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
ret = 0;
for (i = 0; i < cb.cb_used; i++) {
- if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0)
+ if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
ret = -1;
else
good[i] = 1;
@@ -1219,7 +1161,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
* zfs_alloc is supposed to exit if memory isn't available.
*/
for (i = 0; i < cb.cb_used; i++) {
- if (good[i] && zfs_share(cb.cb_datasets[i]) != 0)
+ if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
ret = -1;
}
@@ -1227,34 +1169,12 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
out:
for (i = 0; i < cb.cb_used; i++)
- zfs_close(cb.cb_datasets[i]);
- free(cb.cb_datasets);
+ zfs_close(cb.cb_handles[i]);
+ free(cb.cb_handles);
return (ret);
}
-
-static int
-zvol_cb(const char *dataset, void *data)
-{
- libzfs_handle_t *hdl = data;
- zfs_handle_t *zhp;
-
- /*
- * Ignore snapshots and ignore failures from non-existant datasets.
- */
- if (strchr(dataset, '@') != NULL ||
- (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL)
- return (0);
-
- if (zfs_unshare_iscsi(zhp) != 0)
- return (-1);
-
- zfs_close(zhp);
-
- return (0);
-}
-
static int
mountpoint_compare(const void *a, const void *b)
{
@@ -1264,6 +1184,8 @@ mountpoint_compare(const void *a, const void *b)
return (strcmp(mountb, mounta));
}
+/* alias for 2002/240 */
+#pragma weak zpool_unmount_datasets = zpool_disable_datasets
/*
* Unshare and unmount all datasets within the given pool. We don't want to
* rely on traversing the DSL to discover the filesystems within the pool,
@@ -1271,46 +1193,38 @@ mountpoint_compare(const void *a, const void *b)
* arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and
* gather all the filesystems that are currently mounted.
*/
-#pragma weak zpool_unmount_datasets = zpool_disable_datasets
int
zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
{
int used, alloc;
- struct statfs *sfs;
+ struct mnttab entry;
size_t namelen;
char **mountpoints = NULL;
zfs_handle_t **datasets = NULL;
libzfs_handle_t *hdl = zhp->zpool_hdl;
- int i, j, n;
+ int i;
int ret = -1;
int flags = (force ? MS_FORCE : 0);
- /*
- * First unshare all zvols.
- */
- if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0)
- return (-1);
-
namelen = strlen(zhp->zpool_name);
+ rewind(hdl->libzfs_mnttab);
used = alloc = 0;
- if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
- fprintf(stderr, "getmntinfo(): %s\n", strerror(errno));
- return (-1);
- }
- for (j = 0; j < n; j++) {
+ while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
/*
* Ignore non-ZFS entries.
*/
- if (strcmp(sfs[j].f_fstypename, MNTTYPE_ZFS) != 0)
+ if (entry.mnt_fstype == NULL ||
+ strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
continue;
/*
* Ignore filesystems not within this pool.
*/
- if (strncmp(sfs[j].f_mntfromname, zhp->zpool_name, namelen) != 0 ||
- (sfs[j].f_mntfromname[namelen] != '/' &&
- sfs[j].f_mntfromname[namelen] != '\0'))
+ if (entry.mnt_mountp == NULL ||
+ strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 ||
+ (entry.mnt_special[namelen] != '/' &&
+ entry.mnt_special[namelen] != '\0'))
continue;
/*
@@ -1348,7 +1262,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
}
if ((mountpoints[used] = zfs_strdup(hdl,
- sfs[j].f_mntonname)) == NULL)
+ entry.mnt_mountp)) == NULL)
goto out;
/*
@@ -1356,7 +1270,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
* is only used to determine if we need to remove the underlying
* mountpoint, so failure is not fatal.
*/
- datasets[used] = make_dataset_handle(hdl, sfs[j].f_mntfromname);
+ datasets[used] = make_dataset_handle(hdl, entry.mnt_special);
used++;
}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
index c7edd2e8a9cd..c2306ec3d00e 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
@@ -20,41 +20,38 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
#include <sys/stat.h>
-#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <devid.h>
-#include <dirent.h>
#include <fcntl.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
-#include <zone.h>
#include <sys/zfs_ioctl.h>
-#include <sys/zio.h>
-#include <umem.h>
+#include <dlfcn.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
+#include "zfs_comutil.h"
static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
-#ifdef sun
-#if defined(__i386) || defined(__amd64)
-#define BOOTCMD "installgrub(1M)"
-#else
-#define BOOTCMD "installboot(1M)"
-#endif
-#endif /* sun */
+#define DISK_ROOT "/dev/dsk"
+#define RDISK_ROOT "/dev/rdsk"
+#define BACKUP_SLICE "s2"
+
+typedef struct prop_flags {
+ int create:1; /* Validate property on creation */
+ int import:1; /* Validate property on import */
+} prop_flags_t;
/*
* ====================================================================
@@ -189,6 +186,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
case VDEV_STATE_CANT_OPEN:
if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
return (gettext("FAULTED"));
+ else if (aux == VDEV_AUX_SPLIT_POOL)
+ return (gettext("SPLIT"));
else
return (gettext("UNAVAIL"));
case VDEV_STATE_FAULTED:
@@ -269,8 +268,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
switch (prop) {
case ZPOOL_PROP_SIZE:
- case ZPOOL_PROP_USED:
- case ZPOOL_PROP_AVAILABLE:
+ case ZPOOL_PROP_ALLOCATED:
+ case ZPOOL_PROP_FREE:
(void) zfs_nicenum(intval, buf, len);
break;
@@ -279,11 +278,18 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
(u_longlong_t)intval);
break;
+ case ZPOOL_PROP_DEDUPRATIO:
+ (void) snprintf(buf, len, "%llu.%02llux",
+ (u_longlong_t)(intval / 100),
+ (u_longlong_t)(intval % 100));
+ break;
+
case ZPOOL_PROP_HEALTH:
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
(void) strlcpy(buf, zpool_state_to_name(intval,
vs->vs_aux), len);
@@ -311,17 +317,6 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
return (0);
}
-static boolean_t
-pool_is_bootable(zpool_handle_t *zhp)
-{
- char bootfs[ZPOOL_MAXNAMELEN];
-
- return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
- sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-",
- sizeof (bootfs)) != 0);
-}
-
-
/*
* Check if the bootfs name has the same pool name as it is set to.
* Assuming bootfs is a valid dataset name.
@@ -364,6 +359,17 @@ pool_uses_efi(nvlist_t *config)
return (B_FALSE);
}
+static boolean_t
+pool_is_bootable(zpool_handle_t *zhp)
+{
+ char bootfs[ZPOOL_MAXNAMELEN];
+
+ return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
+ sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-",
+ sizeof (bootfs)) != 0);
+}
+
+
/*
* Given an nvlist of zpool properties to be set, validate that they are
* correct, and parse any numeric properties (index, boolean, etc) if they are
@@ -371,7 +377,7 @@ pool_uses_efi(nvlist_t *config)
*/
static nvlist_t *
zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
- nvlist_t *props, uint64_t version, boolean_t create_or_import, char *errbuf)
+ nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
{
nvpair_t *elem;
nvlist_t *retprops;
@@ -428,7 +434,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
break;
case ZPOOL_PROP_BOOTFS:
- if (create_or_import) {
+ if (flags.create || flags.import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' cannot be set at creation "
"or import time"), propname);
@@ -465,7 +471,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
-#if defined(sun)
+#ifdef sun
/*
* bootfs property cannot be set on a disk which has
* been EFI labeled.
@@ -478,12 +484,12 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
zpool_close(zhp);
goto error;
}
-#endif
+#endif /* sun */
zpool_close(zhp);
break;
case ZPOOL_PROP_ALTROOT:
- if (!create_or_import) {
+ if (!flags.create && !flags.import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set during pool "
"creation or import"), propname);
@@ -538,6 +544,16 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
*slash = '/';
break;
+
+ case ZPOOL_PROP_READONLY:
+ if (!flags.import) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "property '%s' can only be set at "
+ "import time"), propname);
+ (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+ goto error;
+ }
+ break;
}
}
@@ -559,6 +575,7 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
nvlist_t *nvl = NULL;
nvlist_t *realprops;
uint64_t version;
+ prop_flags_t flags = { 0 };
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -574,7 +591,7 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
- zhp->zpool_name, nvl, version, B_FALSE, errbuf)) == NULL) {
+ zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
nvlist_free(nvl);
return (-1);
}
@@ -633,6 +650,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
/*
+ * Don't start the slice at the default block of 34; many storage
+ * devices use a stripe width of 128k (256 sectors of 512 bytes each),
+ * so start there instead.
+ */
+#define NEW_START_BLOCK 256
+
+/*
* Validate the given pool name, optionally putting an extended error message in
* 'buf'.
*/
@@ -875,8 +898,10 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
return (-1);
if (props) {
+ prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
+
if ((zc_props = zpool_valid_proplist(hdl, pool, props,
- SPA_VERSION_1, B_TRUE, msg)) == NULL) {
+ SPA_VERSION_1, flags, msg)) == NULL) {
goto create_failed;
}
}
@@ -994,16 +1019,12 @@ zpool_destroy(zpool_handle_t *zhp)
char msg[1024];
if (zhp->zpool_state == POOL_STATE_ACTIVE &&
- (zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
- ZFS_TYPE_FILESYSTEM)) == NULL)
- return (-1);
-
- if (zpool_remove_zvol_links(zhp) != 0)
+ (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+ if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot destroy '%s'"), zhp->zpool_name);
@@ -1066,7 +1087,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device '%s' contains an EFI label and "
"cannot be used on root pools."),
- zpool_vdev_name(hdl, NULL, spares[s]));
+ zpool_vdev_name(hdl, NULL, spares[s],
+ B_FALSE));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
}
}
@@ -1085,7 +1107,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
switch (errno) {
case EBUSY:
/*
@@ -1161,9 +1183,6 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
zfs_cmd_t zc = { 0 };
char msg[1024];
- if (zpool_remove_zvol_links(zhp) != 0)
- return (-1);
-
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot export '%s'"), zhp->zpool_name);
@@ -1202,6 +1221,132 @@ zpool_export_force(zpool_handle_t *zhp)
return (zpool_export_common(zhp, B_TRUE, B_TRUE));
}
+static void
+zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
+ nvlist_t *config)
+{
+ nvlist_t *nv = NULL;
+ uint64_t rewindto;
+ int64_t loss = -1;
+ struct tm t;
+ char timestr[128];
+
+ if (!hdl->libzfs_printerr || config == NULL)
+ return;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0)
+ return;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+ return;
+ (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+
+ if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+ strftime(timestr, 128, "%c", &t) != 0) {
+ if (dryrun) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Would be able to return %s "
+ "to its state as of %s.\n"),
+ name, timestr);
+ } else {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Pool %s returned to its state as of %s.\n"),
+ name, timestr);
+ }
+ if (loss > 120) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "%s approximately %lld "),
+ dryrun ? "Would discard" : "Discarded",
+ (loss + 30) / 60);
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "minutes of transactions.\n"));
+ } else if (loss > 0) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "%s approximately %lld "),
+ dryrun ? "Would discard" : "Discarded", loss);
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "seconds of transactions.\n"));
+ }
+ }
+}
+
+void
+zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
+ nvlist_t *config)
+{
+ nvlist_t *nv = NULL;
+ int64_t loss = -1;
+ uint64_t edata = UINT64_MAX;
+ uint64_t rewindto;
+ struct tm t;
+ char timestr[128];
+
+ if (!hdl->libzfs_printerr)
+ return;
+
+ if (reason >= 0)
+ (void) printf(dgettext(TEXT_DOMAIN, "action: "));
+ else
+ (void) printf(dgettext(TEXT_DOMAIN, "\t"));
+
+ /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+ nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+ goto no_info;
+
+ (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+ &edata);
+
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Recovery is possible, but will result in some data loss.\n"));
+
+ if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+ strftime(timestr, 128, "%c", &t) != 0) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "\tReturning the pool to its state as of %s\n"
+ "\tshould correct the problem. "),
+ timestr);
+ } else {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "\tReverting the pool to an earlier state "
+ "should correct the problem.\n\t"));
+ }
+
+ if (loss > 120) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Approximately %lld minutes of data\n"
+ "\tmust be discarded, irreversibly. "), (loss + 30) / 60);
+ } else if (loss > 0) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Approximately %lld seconds of data\n"
+ "\tmust be discarded, irreversibly. "), loss);
+ }
+ if (edata != 0 && edata != UINT64_MAX) {
+ if (edata == 1) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "After rewind, at least\n"
+ "\tone persistent user-data error will remain. "));
+ } else {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "After rewind, several\n"
+ "\tpersistent user-data errors will remain. "));
+ }
+ }
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
+ reason >= 0 ? "clear" : "import", name);
+
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "A scrub of the pool\n"
+ "\tis strongly recommended after recovery.\n"));
+ return;
+
+no_info:
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "Destroy and re-create the pool from\n\ta backup source.\n"));
+}
+
/*
* zpool_import() is a contracted interface. Should be kept the same
* if possible.
@@ -1234,12 +1379,40 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
}
}
- ret = zpool_import_props(hdl, config, newname, props, B_FALSE);
+ ret = zpool_import_props(hdl, config, newname, props,
+ ZFS_IMPORT_NORMAL);
if (props)
nvlist_free(props);
return (ret);
}
+static void
+print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
+ int indent)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *vname;
+ uint64_t is_log = 0;
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+
+ if (name != NULL)
+ (void) printf("\t%*s%s%s\n", indent, "", name,
+ is_log ? " [log]" : "");
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return;
+
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
+ print_vdev_tree(hdl, vname, child[c], indent + 2);
+ free(vname);
+ }
+}
+
/*
* Import the given pool using the known configuration and a list of
* properties to be set. The configuration should have come from
@@ -1248,12 +1421,17 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
*/
int
zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
- nvlist_t *props, boolean_t importfaulted)
+ nvlist_t *props, int flags)
{
zfs_cmd_t zc = { 0 };
+ zpool_rewind_policy_t policy;
+ nvlist_t *nv = NULL;
+ nvlist_t *nvinfo = NULL;
+ nvlist_t *missing = NULL;
char *thename;
char *origname;
int ret;
+ int error = 0;
char errbuf[1024];
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
@@ -1274,12 +1452,13 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
if (props) {
uint64_t version;
+ prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
if ((props = zpool_valid_proplist(hdl, origname,
- props, version, B_TRUE, errbuf)) == NULL) {
+ props, version, flags, errbuf)) == NULL) {
return (-1);
} else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
nvlist_free(props);
@@ -1296,11 +1475,39 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_free(props);
return (-1);
}
+ if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
+ nvlist_free(props);
+ return (-1);
+ }
- zc.zc_cookie = (uint64_t)importfaulted;
- ret = 0;
- if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
+ zc.zc_cookie = flags;
+ while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
+ errno == ENOMEM) {
+ if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+ zcmd_free_nvlists(&zc);
+ return (-1);
+ }
+ }
+ if (ret != 0)
+ error = errno;
+
+ (void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
+ zpool_get_rewind_policy(config, &policy);
+
+ if (error) {
char desc[1024];
+
+ /*
+ * Dry-run failed, but we print out what success
+ * looks like if we found a best txg
+ */
+ if (policy.zrp_request & ZPOOL_TRY_REWIND) {
+ zpool_rewind_exclaim(hdl, newname ? origname : thename,
+ B_TRUE, nv);
+ nvlist_free(nv);
+ return (-1);
+ }
+
if (newname == NULL)
(void) snprintf(desc, sizeof (desc),
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
@@ -1310,7 +1517,7 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
origname, thename);
- switch (errno) {
+ switch (error) {
case ENOTSUP:
/*
* Unsupported version.
@@ -1322,10 +1529,38 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
break;
+ case EROFS:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "one or more devices is read only"));
+ (void) zfs_error(hdl, EZFS_BADDEV, desc);
+ break;
+
+ case ENXIO:
+ if (nv && nvlist_lookup_nvlist(nv,
+ ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+ nvlist_lookup_nvlist(nvinfo,
+ ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
+ (void) printf(dgettext(TEXT_DOMAIN,
+ "The devices below are missing, use "
+ "'-m' to import the pool anyway:\n"));
+ print_vdev_tree(hdl, NULL, missing, 2);
+ (void) printf("\n");
+ }
+ (void) zpool_standard_error(hdl, error, desc);
+ break;
+
+ case EEXIST:
+ (void) zpool_standard_error(hdl, error, desc);
+ break;
+
default:
- (void) zpool_standard_error(hdl, errno, desc);
+ (void) zpool_standard_error(hdl, error, desc);
+ zpool_explain_recover(hdl,
+ newname ? origname : thename, -error, nv);
+ break;
}
+ nvlist_free(nv);
ret = -1;
} else {
zpool_handle_t *zhp;
@@ -1333,13 +1568,17 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
/*
* This should never fail, but play it safe anyway.
*/
- if (zpool_open_silent(hdl, thename, &zhp) != 0) {
+ if (zpool_open_silent(hdl, thename, &zhp) != 0)
ret = -1;
- } else if (zhp != NULL) {
- ret = zpool_create_zvol_links(zhp);
+ else if (zhp != NULL)
zpool_close(zhp);
+ if (policy.zrp_request &
+ (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+ zpool_rewind_exclaim(hdl, newname ? origname : thename,
+ ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
}
-
+ nvlist_free(nv);
+ return (0);
}
zcmd_free_nvlists(&zc);
@@ -1349,71 +1588,235 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
}
/*
- * Scrub the pool.
+ * Scan the pool.
*/
int
-zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_cookie = type;
+ zc.zc_cookie = func;
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+ if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+ (errno == ENOENT && func != POOL_SCAN_NONE))
return (0);
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+ if (func == POOL_SCAN_SCRUB) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+ } else if (func == POOL_SCAN_NONE) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
+ zc.zc_name);
+ } else {
+ assert(!"unexpected result");
+ }
- if (errno == EBUSY)
- return (zfs_error(hdl, EZFS_RESILVERING, msg));
- else
+ if (errno == EBUSY) {
+ nvlist_t *nvroot;
+ pool_scan_stat_t *ps = NULL;
+ uint_t psc;
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
+ if (ps && ps->pss_func == POOL_SCAN_SCRUB)
+ return (zfs_error(hdl, EZFS_SCRUBBING, msg));
+ else
+ return (zfs_error(hdl, EZFS_RESILVERING, msg));
+ } else if (errno == ENOENT) {
+ return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+ } else {
return (zpool_standard_error(hdl, errno, msg));
+ }
+}
+
+/*
+ * This provides a very minimal check whether a given string is likely a
+ * c#t#d# style string. Users of this are expected to do their own
+ * verification of the s# part.
+ */
+#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1]))
+
+/*
+ * More elaborate version for ones which may start with "/dev/dsk/"
+ * and the like.
+ */
+static int
+ctd_check_path(char *str)
+{
+ /*
+ * If it starts with a slash, check the last component.
+ */
+ if (str && str[0] == '/') {
+ char *tmp = strrchr(str, '/');
+
+ /*
+ * If it ends in "/old", check the second-to-last
+ * component of the string instead.
+ */
+ if (tmp != str && strcmp(tmp, "/old") == 0) {
+ for (tmp--; *tmp != '/'; tmp--)
+ ;
+ }
+ str = tmp + 1;
+ }
+ return (CTD_CHECK(str));
}
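Hypothetical inputs, for illustration:

	assert(ctd_check_path("/dev/dsk/c0t0d0s0"));	/* last component */
	assert(ctd_check_path("/dev/dsk/c0t0d0s0/old"));	/* "/old" handled */
	assert(!ctd_check_path("/dev/dsk/emcpower0a"));	/* not c#t#d# */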
/*
+ * Find a vdev that matches the search criteria specified. We use the
+ * nvpair name to determine how we should look for the device.
* 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
* spare, but FALSE if it's an INUSE spare.
*/
static nvlist_t *
-vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
- boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
+vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
+ boolean_t *l2cache, boolean_t *log)
{
uint_t c, children;
nvlist_t **child;
- uint64_t theguid, present;
- char *path;
- uint64_t wholedisk = 0;
nvlist_t *ret;
uint64_t is_log;
+ char *srchkey;
+ nvpair_t *pair = nvlist_next_nvpair(search, NULL);
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0);
+ /* Nothing to look for */
+ if (search == NULL || pair == NULL)
+ return (NULL);
+
+ /* Obtain the key we will use to search */
+ srchkey = nvpair_name(pair);
+
+ switch (nvpair_type(pair)) {
+ case DATA_TYPE_UINT64:
+ if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
+ uint64_t srchval, theguid;
+
+ verify(nvpair_value_uint64(pair, &srchval) == 0);
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+ &theguid) == 0);
+ if (theguid == srchval)
+ return (nv);
+ }
+ break;
+
+ case DATA_TYPE_STRING: {
+ char *srchval, *val;
+
+ verify(nvpair_value_string(pair, &srchval) == 0);
+ if (nvlist_lookup_string(nv, srchkey, &val) != 0)
+ break;
- if (search == NULL &&
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) {
/*
- * If the device has never been present since import, the only
- * reliable way to match the vdev is by GUID.
+ * Search for the requested value. Special cases:
+ *
+ * - ZPOOL_CONFIG_PATH for whole disk entries. These end in
+ * "s0" or "s0/old". The "s0" part is hidden from the user,
+ * but included in the string, so this matches around it.
+ * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+ *
+ * Otherwise, all other searches are simple string compares.
*/
- if (theguid == guid)
- return (nv);
- } else if (search != NULL &&
- nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk);
- if (wholedisk) {
+ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
+ ctd_check_path(val)) {
+ uint64_t wholedisk = 0;
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk);
+ if (wholedisk) {
+ int slen = strlen(srchval);
+ int vlen = strlen(val);
+
+ if (slen != vlen - 2)
+ break;
+
+ /*
+ * make_leaf_vdev() should only set
+ * wholedisk for ZPOOL_CONFIG_PATHs which
+ * will include "/dev/dsk/", giving plenty of
+ * room for the indices used next.
+ */
+ ASSERT(vlen >= 6);
+
+ /*
+ * strings identical except trailing "s0"
+ */
+ if (strcmp(&val[vlen - 2], "s0") == 0 &&
+ strncmp(srchval, val, slen) == 0)
+ return (nv);
+
+ /*
+ * strings identical except trailing "s0/old"
+ */
+ if (strcmp(&val[vlen - 6], "s0/old") == 0 &&
+ strcmp(&srchval[slen - 4], "/old") == 0 &&
+ strncmp(srchval, val, slen - 4) == 0)
+ return (nv);
+
+ break;
+ }
+ } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+ char *type, *idx, *end, *p;
+ uint64_t id, vdev_id;
+
+ /*
+ * Determine our vdev type, keeping in mind
+ * that the srchval is composed of a type and
+ * vdev id pair (i.e. mirror-4).
+ */
+ if ((type = strdup(srchval)) == NULL)
+ return (NULL);
+
+ if ((p = strrchr(type, '-')) == NULL) {
+ free(type);
+ break;
+ }
+ idx = p + 1;
+ *p = '\0';
+
+ /*
+ * If the types don't match then keep looking.
+ */
+ if (strncmp(val, type, strlen(val)) != 0) {
+ free(type);
+ break;
+ }
+
+ verify(strncmp(type, VDEV_TYPE_RAIDZ,
+ strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(type, VDEV_TYPE_MIRROR,
+ strlen(VDEV_TYPE_MIRROR)) == 0);
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &id) == 0);
+
+ errno = 0;
+ vdev_id = strtoull(idx, &end, 10);
+
+ free(type);
+ if (errno != 0)
+ return (NULL);
+
/*
- * For whole disks, the internal path has 's0', but the
- * path passed in by the user doesn't.
+ * Now verify that we have the correct vdev id.
*/
- if (strlen(search) == strlen(path) - 2 &&
- strncmp(search, path, strlen(search)) == 0)
+ if (vdev_id == id)
return (nv);
- } else if (strcmp(search, path) == 0) {
- return (nv);
}
+
+ /*
+ * Common case
+ */
+ if (strcmp(srchval, val) == 0)
+ return (nv);
+ break;
+ }
+
+ default:
+ break;
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
@@ -1421,7 +1824,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
return (NULL);
for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
+ if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
/*
* The 'is_log' value is only set for the toplevel
@@ -1442,7 +1845,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
+ if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
*avail_spare = B_TRUE;
return (ret);
@@ -1453,7 +1856,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
- if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
+ if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
*l2cache = B_TRUE;
return (ret);
@@ -1464,24 +1867,65 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
return (NULL);
}
+/*
+ * Given a physical path (minus the "/devices" prefix), find the
+ * associated vdev.
+ */
+nvlist_t *
+zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
+ boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
+{
+ nvlist_t *search, *nvroot, *ret;
+
+ verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ *avail_spare = B_FALSE;
+ *l2cache = B_FALSE;
+ if (log != NULL)
+ *log = B_FALSE;
+ ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+ nvlist_free(search);
+
+ return (ret);
+}
+
+/*
+ * Determine if we have an "interior" top-level vdev (i.e. mirror/raidz).
+ */
+boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+ if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
{
char buf[MAXPATHLEN];
- const char *search;
char *end;
- nvlist_t *nvroot;
+ nvlist_t *nvroot, *search, *ret;
uint64_t guid;
+ verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
guid = strtoull(path, &end, 10);
if (guid != 0 && *end == '\0') {
- search = NULL;
+ verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+ } else if (zpool_vdev_is_interior(path)) {
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
} else if (path[0] != '/') {
(void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path);
- search = buf;
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
} else {
- search = path;
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
}
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
@@ -1491,8 +1935,10 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
*l2cache = B_FALSE;
if (log != NULL)
*log = B_FALSE;
- return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare,
- l2cache, log));
+ ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+ nvlist_free(search);
+
+ return (ret);
}
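With the guid argument gone, every lookup goes through a search nvlist. A minimal sketch of the three forms a caller can now pass (hypothetical handle zhp and example values; error handling elided):

    boolean_t spare, l2cache, islog;

    /* By guid: an all-digit string is parsed by strtoull() above. */
    (void) zpool_find_vdev(zhp, "9203442059916112198", &spare, &l2cache, &islog);

    /* By interior vdev name, using the <type>-<id> convention. */
    (void) zpool_find_vdev(zhp, "mirror-4", &spare, &l2cache, &islog);

    /* By device path; a bare name gets the _PATH_DEV prefix prepended. */
    (void) zpool_find_vdev(zhp, "da0", &spare, &l2cache, &islog);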
static int
@@ -1509,106 +1955,180 @@ vdev_online(nvlist_t *nv)
}
/*
- * Get phys_path for a root pool
- * Return 0 on success; non-zeron on failure.
+ * Helper function for zpool_get_physpaths().
*/
-int
-zpool_get_physpath(zpool_handle_t *zhp, char *physpath)
+static int
+vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
+ size_t *bytes_written)
{
+ size_t bytes_left, pos, rsz;
+ char *tmppath;
+ const char *format;
+
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
+ &tmppath) != 0)
+ return (EZFS_NODEVICE);
+
+ pos = *bytes_written;
+ bytes_left = physpath_size - pos;
+ format = (pos == 0) ? "%s" : " %s";
+
+ rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
+ *bytes_written += rsz;
+
+ if (rsz >= bytes_left) {
+ /* if physpath was not copied properly, clear it */
+ if (bytes_left != 0) {
+ physpath[pos] = 0;
+ }
+ return (EZFS_NOSPC);
+ }
+ return (0);
+}
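The truncation check above relies on snprintf() returning the number of bytes it would have written; a standalone sketch of the same pattern, with illustrative values only:

    char buf[8];
    size_t pos = 0;
    size_t left = sizeof (buf) - pos;
    size_t rsz = snprintf(buf + pos, left, "%s", "/pci@0,0/ide@1");

    if (rsz >= left) {
        /* Output was truncated; roll back the partial copy. */
        if (left != 0)
            buf[pos] = '\0';
    }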
+
+static int
+vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
+ size_t *rsz, boolean_t is_spare)
+{
+ char *type;
+ int ret;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (EZFS_INVALCONFIG);
+
+ if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+ /*
+ * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
+ * For a spare vdev, we only want to boot from the active
+ * spare device.
+ */
+ if (is_spare) {
+ uint64_t spare = 0;
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare);
+ if (!spare)
+ return (EZFS_INVALCONFIG);
+ }
+
+ if (vdev_online(nv)) {
+ if ((ret = vdev_get_one_physpath(nv, physpath,
+ phypath_size, rsz)) != 0)
+ return (ret);
+ }
+ } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
+ strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
+ (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
+ nvlist_t **child;
+ uint_t count;
+ int i, ret;
+
+ if (nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
+ return (EZFS_INVALCONFIG);
+
+ for (i = 0; i < count; i++) {
+ ret = vdev_get_physpaths(child[i], physpath,
+ phypath_size, rsz, is_spare);
+ if (ret == EZFS_NOSPC)
+ return (ret);
+ }
+ }
+
+ return (EZFS_POOL_INVALARG);
+}
+
+/*
+ * Get phys_path for a root pool config.
+ * Return 0 on success; non-zero on failure.
+ */
+static int
+zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
+{
+ size_t rsz;
nvlist_t *vdev_root;
nvlist_t **child;
uint_t count;
- int i;
+ char *type;
- /*
- * Make sure this is a root pool, as phys_path doesn't mean
- * anything to a non-root pool.
- */
- if (!pool_is_bootable(zhp))
- return (-1);
+ rsz = 0;
- verify(nvlist_lookup_nvlist(zhp->zpool_config,
- ZPOOL_CONFIG_VDEV_TREE, &vdev_root) == 0);
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &vdev_root) != 0)
+ return (EZFS_INVALCONFIG);
- if (nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
+ if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
+ nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
&child, &count) != 0)
- return (-2);
+ return (EZFS_INVALCONFIG);
- for (i = 0; i < count; i++) {
- nvlist_t **child2;
- uint_t count2;
- char *type;
- char *tmppath;
- int j;
+ /*
+ * A root pool cannot have EFI-labeled disks and can only have
+ * a single top-level vdev.
+ */
+ if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
+ pool_uses_efi(vdev_root))
+ return (EZFS_POOL_INVALARG);
- if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_TYPE, &type)
- != 0)
- return (-3);
-
- if (strcmp(type, VDEV_TYPE_DISK) == 0) {
- if (!vdev_online(child[i]))
- return (-8);
- verify(nvlist_lookup_string(child[i],
- ZPOOL_CONFIG_PHYS_PATH, &tmppath) == 0);
- (void) strncpy(physpath, tmppath, strlen(tmppath));
- } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) {
- if (nvlist_lookup_nvlist_array(child[i],
- ZPOOL_CONFIG_CHILDREN, &child2, &count2) != 0)
- return (-4);
-
- for (j = 0; j < count2; j++) {
- if (!vdev_online(child2[j]))
- return (-8);
- if (nvlist_lookup_string(child2[j],
- ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0)
- return (-5);
-
- if ((strlen(physpath) + strlen(tmppath)) >
- MAXNAMELEN)
- return (-6);
-
- if (strlen(physpath) == 0) {
- (void) strncpy(physpath, tmppath,
- strlen(tmppath));
- } else {
- (void) strcat(physpath, " ");
- (void) strcat(physpath, tmppath);
- }
- }
- } else {
- return (-7);
- }
- }
+ (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
+ B_FALSE);
+
+ /* No online devices */
+ if (rsz == 0)
+ return (EZFS_NODEVICE);
return (0);
}
/*
- * Returns TRUE if the given guid corresponds to the given type.
- * This is used to check for hot spares (INUSE or not), and level 2 cache
- * devices.
+ * Get phys_path for a root pool.
+ * Return 0 on success; non-zero on failure.
*/
-static boolean_t
-is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type)
+int
+zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
{
- uint64_t target_guid;
- nvlist_t *nvroot;
- nvlist_t **list;
- uint_t count;
- int i;
+ return (zpool_get_config_physpath(zhp->zpool_config, physpath,
+ phypath_size));
+}
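The interface now takes the buffer size explicitly. A hedged usage sketch (hypothetical zhp; on success the buffer holds the space-separated phys paths of all online disks in the single top-level vdev):

    char physpath[MAXPATHLEN];

    if (zpool_get_physpath(zhp, physpath, sizeof (physpath)) == 0)
        (void) printf("%s\n", physpath);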
- verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) {
- for (i = 0; i < count; i++) {
- verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID,
- &target_guid) == 0);
- if (guid == target_guid)
- return (B_TRUE);
- }
+/*
+ * If the device has been dynamically expanded, then we need to relabel
+ * the disk to use the new unallocated space.
+ */
+static int
+zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
+{
+#ifdef sun
+ char path[MAXPATHLEN];
+ char errbuf[1024];
+ int fd, error;
+ int (*_efi_use_whole_disk)(int);
+
+ if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
+ "efi_use_whole_disk")) == NULL)
+ return (-1);
+
+ (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name);
+
+ if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+ "relabel '%s': unable to open device"), name);
+ return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
- return (B_FALSE);
+ /*
+ * It's possible that we might encounter an error if the device
+ * does not have any unallocated space left. If so, we simply
+ * ignore that error and continue on.
+ */
+ error = _efi_use_whole_disk(fd);
+ (void) close(fd);
+ if (error && error != VT_ENOSPC) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+ "relabel '%s': unable to read disk capacity"), name);
+ return (zfs_error(hdl, EZFS_NOCAP, errbuf));
+ }
+#endif /* sun */
+ return (0);
}
/*
@@ -1622,28 +2142,64 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare, l2cache;
+ boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl;
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot online %s"), path);
+ if (flags & ZFS_ONLINE_EXPAND) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
+ } else {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot online %s"), path);
+ }
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == NULL)
+ &islog)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
- if (avail_spare ||
- is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
+ if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
+ if (flags & ZFS_ONLINE_EXPAND ||
+ zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+ char *pathname = NULL;
+ uint64_t wholedisk = 0;
+
+ (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk);
+ verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
+ &pathname) == 0);
+
+ /*
+ * XXX - L2ARC 1.0 devices can't support expansion.
+ */
+ if (l2cache) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "cannot expand cache devices"));
+ return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
+ }
+
+ if (wholedisk) {
+ pathname += strlen(DISK_ROOT) + 1;
+ (void) zpool_relabel_disk(hdl, pathname);
+ }
+ }
+
zc.zc_cookie = VDEV_STATE_ONLINE;
zc.zc_obj = flags;
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0)
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+ if (errno == EINVAL) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
+ "from this pool into a new one. Use '%s' "
+ "instead"), "zpool detach");
+ return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
+ }
return (zpool_standard_error(hdl, errno, msg));
+ }
*newstate = zc.zc_cookie;
return (0);
@@ -1671,14 +2227,13 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
- if (avail_spare ||
- is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
+ if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
zc.zc_cookie = VDEV_STATE_OFFLINE;
zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
@@ -1689,6 +2244,12 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
+ case EEXIST:
+ /*
+ * The log device has unplayed logs
+ */
+ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
+
default:
return (zpool_standard_error(hdl, errno, msg));
}
@@ -1698,7 +2259,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
* Mark the given vdev faulted.
*/
int
-zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
+zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
@@ -1710,8 +2271,9 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_FAULTED;
+ zc.zc_obj = aux;
- if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
@@ -1722,12 +2284,6 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
- case EEXIST:
- /*
- * The log device has unplayed logs
- */
- return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
-
default:
return (zpool_standard_error(hdl, errno, msg));
}
@@ -1738,7 +2294,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
* Mark the given vdev degraded.
*/
int
-zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid)
+zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
@@ -1750,8 +2306,9 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_DEGRADED;
+ zc.zc_obj = aux;
- if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
@@ -1799,7 +2356,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
uint64_t val;
- char *path, *newname;
+ char *newname;
nvlist_t **child;
uint_t children;
nvlist_t *config_root;
@@ -1847,7 +2404,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
- if ((newname = zpool_vdev_name(NULL, NULL, child[0])) == NULL)
+ if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
return (-1);
/*
@@ -1865,32 +2422,25 @@ zpool_vdev_attach(zpool_handle_t *zhp,
return (zfs_error(hdl, EZFS_BADTARGET, msg));
}
- /*
- * If we are attempting to replace a spare, it canot be applied to an
- * already spared device.
- */
- if (replacing &&
- nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
- zpool_find_vdev(zhp, newname, &avail_spare,
- &l2cache, NULL) != NULL && avail_spare &&
- is_replacing_spare(config_root, tgt, 0)) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "device has already been replaced with a spare"));
- free(newname);
- return (zfs_error(hdl, EZFS_BADTARGET, msg));
- }
-
free(newname);
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
- ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+ ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
zcmd_free_nvlists(&zc);
if (ret == 0) {
if (rootpool) {
+ /*
+ * XXX need a better way to prevent user from
+ * booting up a half-baked vdev.
+ */
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
+ "sure to wait until resilver is done "
+ "before rebooting.\n"));
+ (void) fprintf(stderr, "\n");
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If "
"you boot from pool '%s', you may need to update\n"
"boot code on newly attached disk '%s'.\n\n"
@@ -1910,9 +2460,16 @@ zpool_vdev_attach(zpool_handle_t *zhp,
* Can't attach to or replace this type of vdev.
*/
if (replacing) {
+ uint64_t version = zpool_get_prop_int(zhp,
+ ZPOOL_PROP_VERSION, NULL);
+
if (islog)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a log with a spare"));
+ else if (version >= SPA_VERSION_MULTI_REPLACE)
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "already in replacing/spare config; wait "
+ "for completion or use 'zpool detach'"));
else
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a replacing device"));
@@ -2010,7 +2567,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
"applicable to mirror and replacing vdevs"));
- (void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg);
+ (void) zfs_error(hdl, EZFS_BADTARGET, msg);
break;
case EBUSY:
@@ -2028,6 +2585,258 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
}
/*
+ * Find a mirror vdev in the source nvlist.
+ *
+ * The mchild array contains a list of disks in one of the top-level mirrors
+ * of the source pool. The schild array contains a list of disks that the
+ * user specified on the command line. We loop over the mchild array to
+ * see if any entry in the schild array matches.
+ *
+ * If a disk in the mchild array is found in the schild array, we return
+ * the index of that entry. Otherwise we return -1.
+ */
+static int
+find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
+ nvlist_t **schild, uint_t schildren)
+{
+ uint_t mc;
+
+ for (mc = 0; mc < mchildren; mc++) {
+ uint_t sc;
+ char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+ mchild[mc], B_FALSE);
+
+ for (sc = 0; sc < schildren; sc++) {
+ char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+ schild[sc], B_FALSE);
+ boolean_t result = (strcmp(mpath, spath) == 0);
+
+ free(spath);
+ if (result) {
+ free(mpath);
+ return (mc);
+ }
+ }
+
+ free(mpath);
+ }
+
+ return (-1);
+}
+
+/*
+ * Split a mirror pool. If *newroot is NULL, then a new nvlist
+ * is generated and it is the caller's responsibility to free it.
+ */
+int
+zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
+ nvlist_t *props, splitflags_t flags)
+{
+ zfs_cmd_t zc = { 0 };
+ char msg[1024];
+ nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
+ nvlist_t **varray = NULL, *zc_props = NULL;
+ uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+ uint64_t vers;
+ boolean_t freelist = B_FALSE, memory_err = B_TRUE;
+ int retval = 0;
+
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
+
+ if (!zpool_name_valid(hdl, B_FALSE, newname))
+ return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ (void) fprintf(stderr, gettext("Internal error: unable to "
+ "retrieve pool configuration\n"));
+ return (-1);
+ }
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
+ == 0);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
+
+ if (props) {
+ prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
+ if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
+ props, vers, flags, msg)) == NULL)
+ return (-1);
+ }
+
+ if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "Source pool is missing vdev tree"));
+ if (zc_props)
+ nvlist_free(zc_props);
+ return (-1);
+ }
+
+ varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
+ vcount = 0;
+
+ if (*newroot == NULL ||
+ nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
+ &newchild, &newchildren) != 0)
+ newchildren = 0;
+
+ for (c = 0; c < children; c++) {
+ uint64_t is_log = B_FALSE, is_hole = B_FALSE;
+ char *type;
+ nvlist_t **mchild, *vdev;
+ uint_t mchildren;
+ int entry;
+
+ /*
+ * Unlike cache & spares, slogs are stored in the
+ * ZPOOL_CONFIG_CHILDREN array. We filter them out here.
+ */
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+ if (is_log || is_hole) {
+ /*
+ * Create a hole vdev and put it in the config.
+ */
+ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
+ goto out;
+ if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) != 0)
+ goto out;
+ if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
+ 1) != 0)
+ goto out;
+ if (lastlog == 0)
+ lastlog = vcount;
+ varray[vcount++] = vdev;
+ continue;
+ }
+ lastlog = 0;
+ verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
+ == 0);
+ if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "Source pool must be composed only of mirrors\n"));
+ retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+ goto out;
+ }
+
+ verify(nvlist_lookup_nvlist_array(child[c],
+ ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+
+ /* find or add an entry for this top-level vdev */
+ if (newchildren > 0 &&
+ (entry = find_vdev_entry(zhp, mchild, mchildren,
+ newchild, newchildren)) >= 0) {
+ /* We found a disk that the user specified. */
+ vdev = mchild[entry];
+ ++found;
+ } else {
+ /* User didn't specify a disk for this vdev. */
+ vdev = mchild[mchildren - 1];
+ }
+
+ if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
+ goto out;
+ }
+
+ /* did we find every disk the user specified? */
+ if (found != newchildren) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
+ "include at most one disk from each mirror"));
+ retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+ goto out;
+ }
+
+ /* Prepare the nvlist for populating. */
+ if (*newroot == NULL) {
+ if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
+ goto out;
+ freelist = B_TRUE;
+ if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) != 0)
+ goto out;
+ } else {
+ verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
+ }
+
+ /* Add all the children we found */
+ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
+ lastlog == 0 ? vcount : lastlog) != 0)
+ goto out;
+
+ /*
+ * If we're just doing a dry run, exit now with success.
+ */
+ if (flags.dryrun) {
+ memory_err = B_FALSE;
+ freelist = B_FALSE;
+ goto out;
+ }
+
+ /* now build up the config list & call the ioctl */
+ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
+ goto out;
+
+ if (nvlist_add_nvlist(newconfig,
+ ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
+ nvlist_add_string(newconfig,
+ ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
+ nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
+ goto out;
+
+ /*
+ * The new pool is automatically part of the namespace unless we
+ * explicitly export it.
+ */
+ if (!flags.import)
+ zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
+ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+ (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
+ if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
+ goto out;
+ if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
+ goto out;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
+ retval = zpool_standard_error(hdl, errno, msg);
+ goto out;
+ }
+
+ freelist = B_FALSE;
+ memory_err = B_FALSE;
+
+out:
+ if (varray != NULL) {
+ int v;
+
+ for (v = 0; v < vcount; v++)
+ nvlist_free(varray[v]);
+ free(varray);
+ }
+ zcmd_free_nvlists(&zc);
+ if (zc_props)
+ nvlist_free(zc_props);
+ if (newconfig)
+ nvlist_free(newconfig);
+ if (freelist) {
+ nvlist_free(*newroot);
+ *newroot = NULL;
+ }
+
+ if (retval != 0)
+ return (retval);
+
+ if (memory_err)
+ return (no_memory(hdl));
+
+ return (0);
+}
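A sketch of driving a dry-run split from a caller's perspective (hypothetical names; the intended consumer is zpool's split subcommand):

    nvlist_t *newroot = NULL;
    splitflags_t flags = { 0 };

    flags.dryrun = B_TRUE;	/* validate and build the config, skip the ioctl */
    if (zpool_vdev_split(zhp, "newpool", &newroot, NULL, flags) == 0) {
        /* newroot was allocated on our behalf; we must free it. */
        nvlist_free(newroot);
    }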
+
+/*
* Remove the given device. Currently, this is supported only for hot spares
* and level 2 cache devices.
*/
@@ -2037,24 +2846,34 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare, l2cache;
+ boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl;
+ uint64_t version;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == 0)
+ &islog)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- if (!avail_spare && !l2cache) {
+ /*
+ * XXX - this should just go away.
+ */
+ if (!avail_spare && !l2cache && !islog) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "only inactive hot spares or cache devices "
- "can be removed"));
+ "only inactive hot spares, cache, top-level, "
+ "or log devices can be removed"));
return (zfs_error(hdl, EZFS_NODEVICE, msg));
}
+ version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+ if (islog && version < SPA_VERSION_HOLES) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgrade to support log removal"));
+ return (zfs_error(hdl, EZFS_BADVERSION, msg));
+ }
+
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
@@ -2067,13 +2886,16 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
* Clear the errors for the pool, or the particular device if specified.
*/
int
-zpool_clear(zpool_handle_t *zhp, const char *path)
+zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
+ zpool_rewind_policy_t policy;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
+ nvlist_t *nvi = NULL;
+ int error;
if (path)
(void) snprintf(msg, sizeof (msg),
@@ -2101,9 +2923,38 @@ zpool_clear(zpool_handle_t *zhp, const char *path)
&zc.zc_guid) == 0);
}
- if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0)
+ zpool_get_rewind_policy(rewindnvl, &policy);
+ zc.zc_cookie = policy.zrp_request;
+
+ if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
+ return (-1);
+
+ if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
+ return (-1);
+
+ while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
+ errno == ENOMEM) {
+ if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+ zcmd_free_nvlists(&zc);
+ return (-1);
+ }
+ }
+
+ if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+ errno != EPERM && errno != EACCES)) {
+ if (policy.zrp_request &
+ (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+ (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+ zpool_rewind_exclaim(hdl, zc.zc_name,
+ ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+ nvi);
+ nvlist_free(nvi);
+ }
+ zcmd_free_nvlists(&zc);
return (0);
+ }
+ zcmd_free_nvlists(&zc);
return (zpool_standard_error(hdl, errno, msg));
}
@@ -2123,6 +2974,7 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
+ zc.zc_cookie = ZPOOL_NO_REWIND;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
return (0);
@@ -2131,173 +2983,6 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
}
/*
- * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/<pool>
- * hierarchy.
- */
-int
-zpool_iter_zvol(zpool_handle_t *zhp, int (*cb)(const char *, void *),
- void *data)
-{
- libzfs_handle_t *hdl = zhp->zpool_hdl;
- char (*paths)[MAXPATHLEN];
- char path[MAXPATHLEN];
- size_t size = 4;
- int curr, fd, base, ret = 0;
- DIR *dirp;
- struct dirent *dp;
- struct stat st;
-
- if ((base = open(ZVOL_FULL_DEV_DIR, O_RDONLY)) < 0)
- return (errno == ENOENT ? 0 : -1);
-
- snprintf(path, sizeof(path), "%s/%s", ZVOL_FULL_DEV_DIR,
- zhp->zpool_name);
- if (stat(path, &st) != 0) {
- int err = errno;
- (void) close(base);
- return (err == ENOENT ? 0 : -1);
- }
-
- /*
- * Oddly this wasn't a directory -- ignore that failure since we
- * know there are no links lower in the (non-existant) hierarchy.
- */
- if (!S_ISDIR(st.st_mode)) {
- (void) close(base);
- return (0);
- }
-
- if ((paths = zfs_alloc(hdl, size * sizeof (paths[0]))) == NULL) {
- (void) close(base);
- return (-1);
- }
-
- (void) strlcpy(paths[0], zhp->zpool_name, sizeof (paths[0]));
- curr = 0;
-
- while (curr >= 0) {
- snprintf(path, sizeof(path), "%s/%s", ZVOL_FULL_DEV_DIR,
- paths[curr]);
- if (lstat(path, &st) != 0)
- goto err;
-
- if (S_ISDIR(st.st_mode)) {
- if ((dirp = opendir(path)) == NULL) {
- goto err;
- }
-
- while ((dp = readdir(dirp)) != NULL) {
- if (dp->d_name[0] == '.')
- continue;
-
- if (curr + 1 == size) {
- paths = zfs_realloc(hdl, paths,
- size * sizeof (paths[0]),
- size * 2 * sizeof (paths[0]));
- if (paths == NULL) {
- (void) closedir(dirp);
- goto err;
- }
-
- size *= 2;
- }
-
- (void) strlcpy(paths[curr + 1], paths[curr],
- sizeof (paths[curr + 1]));
- (void) strlcat(paths[curr], "/",
- sizeof (paths[curr]));
- (void) strlcat(paths[curr], dp->d_name,
- sizeof (paths[curr]));
- curr++;
- }
-
- (void) closedir(dirp);
-
- } else {
- if ((ret = cb(paths[curr], data)) != 0)
- break;
- }
-
- curr--;
- }
-
- free(paths);
- (void) close(base);
-
- return (ret);
-
-err:
- free(paths);
- (void) close(base);
- return (-1);
-}
-
-typedef struct zvol_cb {
- zpool_handle_t *zcb_pool;
- boolean_t zcb_create;
-} zvol_cb_t;
-
-/*ARGSUSED*/
-static int
-do_zvol_create(zfs_handle_t *zhp, void *data)
-{
- int ret = 0;
-
- if (ZFS_IS_VOLUME(zhp)) {
- (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
- ret = zfs_iter_snapshots(zhp, do_zvol_create, NULL);
- }
-
- if (ret == 0)
- ret = zfs_iter_filesystems(zhp, do_zvol_create, NULL);
-
- zfs_close(zhp);
-
- return (ret);
-}
-
-/*
- * Iterate over all zvols in the pool and make any necessary minor nodes.
- */
-int
-zpool_create_zvol_links(zpool_handle_t *zhp)
-{
- zfs_handle_t *zfp;
- int ret;
-
- /*
- * If the pool is unavailable, just return success.
- */
- if ((zfp = make_dataset_handle(zhp->zpool_hdl,
- zhp->zpool_name)) == NULL)
- return (0);
-
- ret = zfs_iter_filesystems(zfp, do_zvol_create, NULL);
-
- zfs_close(zfp);
- return (ret);
-}
-
-static int
-do_zvol_remove(const char *dataset, void *data)
-{
- zpool_handle_t *zhp = data;
-
- return (zvol_remove_link(zhp->zpool_hdl, dataset));
-}
-
-/*
- * Iterate over all zvols in the pool and remove any minor nodes. We iterate
- * by examining the /dev links so that a corrupted pool doesn't impede this
- * operation.
- */
-int
-zpool_remove_zvol_links(zpool_handle_t *zhp)
-{
- return (zpool_iter_zvol(zhp, do_zvol_remove, zhp));
-}
-
-/*
* Convert from a devid string to a path.
*/
static char *
@@ -2389,7 +3074,8 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
* of these checks.
*/
char *
-zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
+zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
+ boolean_t verbose)
{
char *path, *devid;
uint64_t value;
@@ -2412,7 +3098,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
* open a misbehaving device, which can have undesirable
* effects.
*/
- if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) != 0 ||
vs->vs_state >= VDEV_STATE_DEGRADED) &&
zhp != NULL &&
@@ -2444,17 +3130,35 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
devid_str_free(newdevid);
}
- if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
- path += sizeof(_PATH_DEV) - 1;
+#ifdef sun
+ if (strncmp(path, "/dev/dsk/", 9) == 0)
+ path += 9;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&value) == 0 && value) {
+ int pathlen = strlen(path);
char *tmp = zfs_strdup(hdl, path);
- if (tmp == NULL)
- return (NULL);
- tmp[strlen(path) - 2] = '\0';
+
+ /*
+ * If it starts with c#, and ends with "s0", chop
+ * the "s0" off, or if it ends with "s0/old", remove
+ * the "s0" from the middle.
+ */
+ if (CTD_CHECK(tmp)) {
+ if (strcmp(&tmp[pathlen - 2], "s0") == 0) {
+ tmp[pathlen - 2] = '\0';
+ } else if (pathlen > 6 &&
+ strcmp(&tmp[pathlen - 6], "s0/old") == 0) {
+ (void) strcpy(&tmp[pathlen - 6],
+ "/old");
+ }
+ }
return (tmp);
}
+#else /* !sun */
+ if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+ path += sizeof(_PATH_DEV) - 1;
+#endif /* !sun */
} else {
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
@@ -2468,6 +3172,20 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
(u_longlong_t)value);
path = buf;
}
+
+ /*
+ * We identify each top-level vdev by using a <type-id>
+ * naming convention.
+ */
+ if (verbose) {
+ uint64_t id;
+
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &id) == 0);
+ (void) snprintf(buf, sizeof (buf), "%s-%llu", path,
+ (u_longlong_t)id);
+ path = buf;
+ }
}
return (zfs_strdup(hdl, path));
@@ -2686,7 +3404,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
* into 'records'. 'leftover' is set to the number of bytes that weren't
* processed as there wasn't a complete record.
*/
-static int
+int
zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
nvlist_t ***records, uint_t *numrecords)
{
@@ -2815,15 +3533,7 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
free(mntpnt);
}
-#define RDISK_ROOT "/dev/rdsk"
-#define BACKUP_SLICE "s2"
-/*
- * Don't start the slice at the default block of 34; many storage
- * devices will use a stripe width of 128k, so start there instead.
- */
-#define NEW_START_BLOCK 256
-
-#if defined(sun)
+#ifdef sun
/*
* Read the EFI label from the config, if a label does not exist then
* pass back the error to the caller. If the caller has passed a non-NULL
@@ -2897,7 +3607,7 @@ find_start_block(nvlist_t *config)
int
zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
{
-#if defined(sun)
+#ifdef sun
char path[MAXPATHLEN];
struct dk_gpt *vtoc;
int fd;
@@ -3017,6 +3727,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(type, VDEV_TYPE_FILE) == 0 ||
strcmp(type, VDEV_TYPE_LOG) == 0 ||
+ strcmp(type, VDEV_TYPE_HOLE) == 0 ||
strcmp(type, VDEV_TYPE_MISSING) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"vdev type '%s' is not supported"), type);
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
index cdde90a89800..9d3c9845078c 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -36,24 +35,396 @@
#include <fcntl.h>
#include <sys/param.h>
#include <sys/mount.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/avl.h>
-#include <stddef.h>
+#include <pthread.h>
+#include <umem.h>
#include <libzfs.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
+#include "zfs_fletcher.h"
#include "libzfs_impl.h"
+#include <sha2.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
-#include <fletcher.c> /* XXX */
-
+/* in libzfs_dataset.c */
+extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
/* We need to use something for ENODATA. */
#define ENODATA EIDRM
static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
- int, avl_tree_t *, char **);
+ int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *);
+
+static const zio_cksum_t zero_cksum = { 0 };
+
+typedef struct dedup_arg {
+ int inputfd;
+ int outputfd;
+ libzfs_handle_t *dedup_hdl;
+} dedup_arg_t;
+
+typedef struct dataref {
+ uint64_t ref_guid;
+ uint64_t ref_object;
+ uint64_t ref_offset;
+} dataref_t;
+
+typedef struct dedup_entry {
+ struct dedup_entry *dde_next;
+ zio_cksum_t dde_chksum;
+ uint64_t dde_prop;
+ dataref_t dde_ref;
+} dedup_entry_t;
+
+#define MAX_DDT_PHYSMEM_PERCENT 20
+#define SMALLEST_POSSIBLE_MAX_DDT_MB 128
+
+typedef struct dedup_table {
+ dedup_entry_t **dedup_hash_array;
+ umem_cache_t *ddecache;
+ uint64_t max_ddt_size; /* max dedup table size in bytes */
+ uint64_t cur_ddt_size; /* current dedup table size in bytes */
+ uint64_t ddt_count;
+ int numhashbits;
+ boolean_t ddt_full;
+} dedup_table_t;
+
+static int
+high_order_bit(uint64_t n)
+{
+ int count;
+
+ for (count = 0; n != 0; count++)
+ n >>= 1;
+ return (count);
+}
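A worked example of the bucket sizing performed in cksummer() below (illustrative numbers):

    uint64_t numbuckets = 1000;	/* not a power of 2 */

    if (!ISP2(numbuckets))
        numbuckets = 1 << high_order_bit(numbuckets);	/* 1 << 10 == 1024 */
    /* numhashbits is then high_order_bit(1024) - 1 == 10. */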
+
+static size_t
+ssread(void *buf, size_t len, FILE *stream)
+{
+ size_t outlen;
+
+ if ((outlen = fread(buf, len, 1, stream)) == 0)
+ return (0);
+
+ return (outlen);
+}
+
+static void
+ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
+ zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
+{
+ dedup_entry_t *dde;
+
+ if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
+ if (ddt->ddt_full == B_FALSE) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "Dedup table full. Deduplication will continue "
+ "with existing table entries"));
+ ddt->ddt_full = B_TRUE;
+ }
+ return;
+ }
+
+ if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT))
+ != NULL) {
+ assert(*ddepp == NULL);
+ dde->dde_next = NULL;
+ dde->dde_chksum = *cs;
+ dde->dde_prop = prop;
+ dde->dde_ref = *dr;
+ *ddepp = dde;
+ ddt->cur_ddt_size += sizeof (dedup_entry_t);
+ ddt->ddt_count++;
+ }
+}
+
+/*
+ * Using the specified dedup table, do a lookup for an entry with
+ * the checksum cs. If found, return the block's reference info
+ * in *dr. Otherwise, insert a new entry in the dedup table, using
+ * the reference information specified by *dr.
+ *
+ * return value: true - entry was found
+ * false - entry was not found
+ */
+static boolean_t
+ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
+ uint64_t prop, dataref_t *dr)
+{
+ uint32_t hashcode;
+ dedup_entry_t **ddepp;
+
+ hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
+
+ for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL;
+ ddepp = &((*ddepp)->dde_next)) {
+ if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) &&
+ (*ddepp)->dde_prop == prop) {
+ *dr = (*ddepp)->dde_ref;
+ return (B_TRUE);
+ }
+ }
+ ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr);
+ return (B_FALSE);
+}
+
+static int
+cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
+{
+ fletcher_4_incremental_native(buf, len, zc);
+ return (write(outfd, buf, len));
+}
+
+/*
+ * This function is started in a separate thread when the dedup option
+ * has been requested. The main send thread determines the list of
+ * snapshots to be included in the send stream and makes the ioctl calls
+ * for each one. But instead of having the ioctl send the output to
+ * the output fd specified by the caller of zfs_send(), the
+ * ioctl is told to direct the output to a pipe, which is read by the
+ * alternate thread running THIS function. This function does the
+ * dedup'ing by:
+ * 1. building a dedup table (the DDT)
+ * 2. doing checksums on each data block and inserting a record in the DDT
+ * 3. looking for matching checksums, and
+ * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever
+ * a duplicate block is found.
+ * The output of this function then goes to the output fd requested
+ * by the caller of zfs_send().
+ */
+static void *
+cksummer(void *arg)
+{
+ dedup_arg_t *dda = arg;
+ char *buf = malloc(1<<20);
+ dmu_replay_record_t thedrr;
+ dmu_replay_record_t *drr = &thedrr;
+ struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+ struct drr_end *drre = &thedrr.drr_u.drr_end;
+ struct drr_object *drro = &thedrr.drr_u.drr_object;
+ struct drr_write *drrw = &thedrr.drr_u.drr_write;
+ struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+ FILE *ofp;
+ int outfd;
+ dmu_replay_record_t wbr_drr = {0};
+ struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
+ dedup_table_t ddt;
+ zio_cksum_t stream_cksum;
+ uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
+ uint64_t numbuckets;
+
+ ddt.max_ddt_size =
+ MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
+ SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
+
+ numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
+
+ /*
+ * numbuckets must be a power of 2. Round it up to
+ * the next power of 2 if necessary.
+ */
+ if (!ISP2(numbuckets))
+ numbuckets = 1 << high_order_bit(numbuckets);
+
+ ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
+ ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
+ NULL, NULL, NULL, NULL, NULL, 0);
+ ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
+ ddt.numhashbits = high_order_bit(numbuckets) - 1;
+ ddt.ddt_full = B_FALSE;
+
+ /* Initialize the write-by-reference block. */
+ wbr_drr.drr_type = DRR_WRITE_BYREF;
+ wbr_drr.drr_payloadlen = 0;
+
+ outfd = dda->outputfd;
+ ofp = fdopen(dda->inputfd, "r");
+ while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ {
+ int fflags;
+ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+
+ /* set the DEDUP feature flag for this stream */
+ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ fflags |= (DMU_BACKUP_FEATURE_DEDUP |
+ DMU_BACKUP_FEATURE_DEDUPPROPS);
+ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
+
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
+ int sz = drr->drr_payloadlen;
+
+ if (sz > 1<<20) {
+ free(buf);
+ buf = malloc(sz);
+ }
+ (void) ssread(buf, sz, ofp);
+ if (ferror(ofp))
+ perror("fread");
+ if (cksum_and_write(buf, sz, &stream_cksum,
+ outfd) == -1)
+ goto out;
+ }
+ break;
+ }
+
+ case DRR_END:
+ {
+ /* use the recalculated checksum */
+ ZIO_SET_CHECKSUM(&drre->drr_checksum,
+ stream_cksum.zc_word[0], stream_cksum.zc_word[1],
+ stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
+ if ((write(outfd, drr,
+ sizeof (dmu_replay_record_t))) == -1)
+ goto out;
+ break;
+ }
+
+ case DRR_OBJECT:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ if (drro->drr_bonuslen > 0) {
+ (void) ssread(buf,
+ P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+ ofp);
+ if (cksum_and_write(buf,
+ P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ }
+ break;
+ }
+
+ case DRR_SPILL:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ (void) ssread(buf, drrs->drr_length, ofp);
+ if (cksum_and_write(buf, drrs->drr_length,
+ &stream_cksum, outfd) == -1)
+ goto out;
+ break;
+ }
+
+ case DRR_FREEOBJECTS:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ break;
+ }
+
+ case DRR_WRITE:
+ {
+ dataref_t dataref;
+
+ (void) ssread(buf, drrw->drr_length, ofp);
+
+ /*
+ * Use the existing checksum if it's dedup-capable,
+ * else calculate a SHA256 checksum for it.
+ */
+
+ if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
+ zero_cksum) ||
+ !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
+ SHA256_CTX ctx;
+ zio_cksum_t tmpsha256;
+
+ SHA256Init(&ctx);
+ SHA256Update(&ctx, buf, drrw->drr_length);
+ SHA256Final(&tmpsha256, &ctx);
+ drrw->drr_key.ddk_cksum.zc_word[0] =
+ BE_64(tmpsha256.zc_word[0]);
+ drrw->drr_key.ddk_cksum.zc_word[1] =
+ BE_64(tmpsha256.zc_word[1]);
+ drrw->drr_key.ddk_cksum.zc_word[2] =
+ BE_64(tmpsha256.zc_word[2]);
+ drrw->drr_key.ddk_cksum.zc_word[3] =
+ BE_64(tmpsha256.zc_word[3]);
+ drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
+ drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
+ }
+
+ dataref.ref_guid = drrw->drr_toguid;
+ dataref.ref_object = drrw->drr_object;
+ dataref.ref_offset = drrw->drr_offset;
+
+ if (ddt_update(dda->dedup_hdl, &ddt,
+ &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
+ &dataref)) {
+ /* block already present in stream */
+ wbr_drrr->drr_object = drrw->drr_object;
+ wbr_drrr->drr_offset = drrw->drr_offset;
+ wbr_drrr->drr_length = drrw->drr_length;
+ wbr_drrr->drr_toguid = drrw->drr_toguid;
+ wbr_drrr->drr_refguid = dataref.ref_guid;
+ wbr_drrr->drr_refobject =
+ dataref.ref_object;
+ wbr_drrr->drr_refoffset =
+ dataref.ref_offset;
+
+ wbr_drrr->drr_checksumtype =
+ drrw->drr_checksumtype;
+ wbr_drrr->drr_checksumflags =
+ drrw->drr_checksumflags;
+ wbr_drrr->drr_key.ddk_cksum =
+ drrw->drr_key.ddk_cksum;
+ wbr_drrr->drr_key.ddk_prop =
+ drrw->drr_key.ddk_prop;
+
+ if (cksum_and_write(&wbr_drr,
+ sizeof (dmu_replay_record_t), &stream_cksum,
+ outfd) == -1)
+ goto out;
+ } else {
+ /* block not previously seen */
+ if (cksum_and_write(drr,
+ sizeof (dmu_replay_record_t), &stream_cksum,
+ outfd) == -1)
+ goto out;
+ if (cksum_and_write(buf,
+ drrw->drr_length,
+ &stream_cksum, outfd) == -1)
+ goto out;
+ }
+ break;
+ }
+
+ case DRR_FREE:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ break;
+ }
+
+ default:
+ (void) printf("INVALID record type 0x%x\n",
+ drr->drr_type);
+ /* should never happen, so assert */
+ assert(B_FALSE);
+ }
+ }
+out:
+ umem_cache_destroy(ddt.ddecache);
+ free(ddt.dedup_hash_array);
+ free(buf);
+ (void) fclose(ofp);
+
+ return (NULL);
+}
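The plumbing that feeds this thread lives in zfs_send() below; in sketch form (Solaris pipes are bidirectional, so the ioctl side can use the other end):

    int pipefd[2];
    pthread_t tid;
    dedup_arg_t dda = { 0 };

    (void) pipe(pipefd);
    dda.outputfd = outfd;	/* deduplicated stream goes to the caller's fd */
    dda.inputfd = pipefd[1];	/* cksummer() reads the raw stream here */
    dda.dedup_hdl = zhp->zfs_hdl;
    (void) pthread_create(&tid, NULL, cksummer, &dda);
    /* ... the send ioctls then write the raw stream to pipefd[0] ... */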
/*
* Routines for dealing with the AVL tree of fs-nvlists
@@ -116,6 +487,9 @@ fsavl_destroy(avl_tree_t *avl)
free(avl);
}
+/*
+ * Given an nvlist, produce an AVL tree of snapshots, ordered by guid.
+ */
static avl_tree_t *
fsavl_create(nvlist_t *fss)
{
@@ -173,6 +547,7 @@ typedef struct send_data {
nvlist_t *snapprops;
const char *fromsnap;
const char *tosnap;
+ boolean_t recursive;
/*
* The header nvlist is of the following format:
@@ -240,25 +615,50 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
zfs_prop_t prop = zfs_name_to_prop(propname);
nvlist_t *propnv;
- assert(zfs_prop_user(propname) || prop != ZPROP_INVAL);
+ if (!zfs_prop_user(propname)) {
+ /*
+ * Realistically, this should never happen. However,
+ * we want the ability to add DSL properties without
+ * needing to make incompatible version changes. We
+ * need to ignore unknown properties to allow older
+ * software to still send datasets containing these
+ * properties, with the unknown properties elided.
+ */
+ if (prop == ZPROP_INVAL)
+ continue;
- if (!zfs_prop_user(propname) && zfs_prop_readonly(prop))
- continue;
+ if (zfs_prop_readonly(prop))
+ continue;
+ }
verify(nvpair_value_nvlist(elem, &propnv) == 0);
- if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) {
- /* these guys are modifyable, but have no source */
+ if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
+ prop == ZFS_PROP_REFQUOTA ||
+ prop == ZFS_PROP_REFRESERVATION) {
+ char *source;
uint64_t value;
verify(nvlist_lookup_uint64(propnv,
ZPROP_VALUE, &value) == 0);
if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
continue;
+ /*
+ * May have no source before SPA_VERSION_RECVD_PROPS,
+ * but is still modifiable.
+ */
+ if (nvlist_lookup_string(propnv,
+ ZPROP_SOURCE, &source) == 0) {
+ if ((strcmp(source, zhp->zfs_name) != 0) &&
+ (strcmp(source,
+ ZPROP_SOURCE_VAL_RECVD) != 0))
+ continue;
+ }
} else {
char *source;
if (nvlist_lookup_string(propnv,
ZPROP_SOURCE, &source) != 0)
continue;
- if (strcmp(source, zhp->zfs_name) != 0)
+ if ((strcmp(source, zhp->zfs_name) != 0) &&
+ (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0))
continue;
}
@@ -277,12 +677,17 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
}
}
+/*
+ * Recursively generate nvlists describing datasets. See the comment
+ * above the send_data_t structure for a description of the contents
+ * of the nvlist.
+ */
static int
send_iterate_fs(zfs_handle_t *zhp, void *arg)
{
send_data_t *sd = arg;
nvlist_t *nvfs, *nv;
- int rv;
+ int rv = 0;
uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
uint64_t guid = zhp->zfs_dmustats.dds_guid;
char guidstring[64];
@@ -324,7 +729,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg)
nvlist_free(nvfs);
/* iterate over children */
- rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
+ if (sd->recursive)
+ rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
@@ -334,7 +740,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg)
static int
gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
- const char *tosnap, nvlist_t **nvlp, avl_tree_t **avlp)
+ const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp)
{
zfs_handle_t *zhp;
send_data_t sd = { 0 };
@@ -347,6 +753,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
sd.fromsnap = fromsnap;
sd.tosnap = tosnap;
+ sd.recursive = recursive;
if ((error = send_iterate_fs(zhp, &sd)) != 0) {
nvlist_free(sd.fss);
@@ -378,14 +785,30 @@ static int
zfs_sort_snaps(zfs_handle_t *zhp, void *data)
{
avl_tree_t *avl = data;
- zfs_node_t *node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
+ zfs_node_t *node;
+ zfs_node_t search;
+
+ search.zn_handle = zhp;
+ node = avl_find(avl, &search, NULL);
+ if (node) {
+ /*
+ * If this snapshot was renamed while we were creating the
+ * AVL tree, it's possible that we already inserted it under
+ * its old name. Remove the old handle before adding the new
+ * one.
+ */
+ zfs_close(node->zn_handle);
+ avl_remove(avl, node);
+ free(node);
+ }
+ node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
node->zn_handle = zhp;
avl_add(avl, node);
+
return (0);
}
-/* ARGSUSED */
static int
zfs_snapshot_compare(const void *larg, const void *rarg)
{
@@ -408,7 +831,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg)
return (0);
}
-static int
+int
zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data)
{
int ret = 0;
@@ -439,13 +862,19 @@ typedef struct send_dump_data {
/* these are all just the short snapname (the part after the @) */
const char *fromsnap;
const char *tosnap;
- char lastsnap[ZFS_MAXNAMELEN];
+ char prevsnap[ZFS_MAXNAMELEN];
+ uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose;
int outfd;
boolean_t err;
nvlist_t *fss;
avl_tree_t *fsavl;
+ snapfilter_cb_t *filter_cb;
+ void *filter_cb_arg;
+ nvlist_t *debugnv;
+ char holdtag[ZFS_MAXNAMELEN];
+ int cleanup_fd;
} send_dump_data_t;
/*
@@ -453,26 +882,40 @@ typedef struct send_dump_data {
* NULL) to the file descriptor specified by outfd.
*/
static int
-dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
- int outfd)
+dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
+ boolean_t fromorigin, int outfd, nvlist_t *debugnv)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
+ nvlist_t *thisdbg;
assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
- assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
+ assert(fromsnap_obj == 0 || !fromorigin);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- if (fromsnap)
- (void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value));
zc.zc_cookie = outfd;
zc.zc_obj = fromorigin;
+ zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
+ zc.zc_fromobj = fromsnap_obj;
+
+ VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
+ if (fromsnap && fromsnap[0] != '\0') {
+ VERIFY(0 == nvlist_add_string(thisdbg,
+ "fromsnap", fromsnap));
+ }
if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name);
+ VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno));
+ if (debugnv) {
+ VERIFY(0 == nvlist_add_nvlist(debugnv,
+ zhp->zfs_name, thisdbg));
+ }
+ nvlist_free(thisdbg);
+
switch (errno) {
case EXDEV:
@@ -507,24 +950,74 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
}
}
+ if (debugnv)
+ VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
+ nvlist_free(thisdbg);
+
return (0);
}
static int
+hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
+{
+ zfs_handle_t *pzhp;
+ int error = 0;
+ char *thissnap;
+
+ assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+ /*
+ * zfs_send() only opens a cleanup_fd for sends that need it,
+ * e.g. replication and doall.
+ */
+ if (sdd->cleanup_fd == -1)
+ return (0);
+
+ thissnap = strchr(zhp->zfs_name, '@') + 1;
+ *(thissnap - 1) = '\0';
+ pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET);
+ *(thissnap - 1) = '@';
+
+ /*
+ * It's OK if the parent no longer exists. The send code will
+ * handle that error.
+ */
+ if (pzhp) {
+ error = zfs_hold(pzhp, thissnap, sdd->holdtag,
+ B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd,
+ zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID),
+ zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG));
+ zfs_close(pzhp);
+ }
+
+ return (error);
+}
+
+static int
dump_snapshot(zfs_handle_t *zhp, void *arg)
{
send_dump_data_t *sdd = arg;
- const char *thissnap;
+ char *thissnap;
int err;
+ boolean_t isfromsnap, istosnap;
+ boolean_t exclude = B_FALSE;
thissnap = strchr(zhp->zfs_name, '@') + 1;
+ isfromsnap = (sdd->fromsnap != NULL &&
+ strcmp(sdd->fromsnap, thissnap) == 0);
- if (sdd->fromsnap && !sdd->seenfrom &&
- strcmp(sdd->fromsnap, thissnap) == 0) {
- sdd->seenfrom = B_TRUE;
- (void) strcpy(sdd->lastsnap, thissnap);
+ if (!sdd->seenfrom && isfromsnap) {
+ err = hold_for_send(zhp, sdd);
+ if (err == 0) {
+ sdd->seenfrom = B_TRUE;
+ (void) strcpy(sdd->prevsnap, thissnap);
+ sdd->prevsnap_obj = zfs_prop_get_int(zhp,
+ ZFS_PROP_OBJSETID);
+ } else if (err == ENOENT) {
+ err = 0;
+ }
zfs_close(zhp);
- return (0);
+ return (err);
}
if (sdd->seento || !sdd->seenfrom) {
@@ -532,20 +1025,69 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
return (0);
}
+ istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
+ if (istosnap)
+ sdd->seento = B_TRUE;
+
+ if (!sdd->doall && !isfromsnap && !istosnap) {
+ if (sdd->replicate) {
+ char *snapname;
+ nvlist_t *snapprops;
+ /*
+ * Filter out all intermediate snapshots except origin
+ * snapshots needed to replicate clones.
+ */
+ nvlist_t *nvfs = fsavl_find(sdd->fsavl,
+ zhp->zfs_dmustats.dds_guid, &snapname);
+
+ VERIFY(0 == nvlist_lookup_nvlist(nvfs,
+ "snapprops", &snapprops));
+ VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+ thissnap, &snapprops));
+ exclude = !nvlist_exists(snapprops, "is_clone_origin");
+ } else {
+ exclude = B_TRUE;
+ }
+ }
+
+ /*
+ * If a filter function exists, call it to determine whether
+ * this snapshot will be sent.
+ */
+ if (exclude || (sdd->filter_cb != NULL &&
+ sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
+ /*
+ * This snapshot is filtered out. Don't send it, and don't
+ * set prevsnap_obj, so it will be as if this snapshot didn't
+ * exist, and the next accepted snapshot will be sent as
+ * an incremental from the last accepted one, or as the
+ * first (and full) snapshot in the case of a replication,
+ * non-incremental send.
+ */
+ zfs_close(zhp);
+ return (0);
+ }
+
+ err = hold_for_send(zhp, sdd);
+ if (err) {
+ if (err == ENOENT)
+ err = 0;
+ zfs_close(zhp);
+ return (err);
+ }
+
/* send it */
if (sdd->verbose) {
(void) fprintf(stderr, "sending from @%s to %s\n",
- sdd->lastsnap, zhp->zfs_name);
+ sdd->prevsnap, zhp->zfs_name);
}
- err = dump_ioctl(zhp, sdd->lastsnap,
- sdd->lastsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
- sdd->outfd);
+ err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
+ sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
+ sdd->outfd, sdd->debugnv);
- if (!sdd->seento && strcmp(sdd->tosnap, thissnap) == 0)
- sdd->seento = B_TRUE;
-
- (void) strcpy(sdd->lastsnap, thissnap);
+ (void) strcpy(sdd->prevsnap, thissnap);
+ sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zfs_close(zhp);
return (err);
}
@@ -584,51 +1126,33 @@ dump_filesystem(zfs_handle_t *zhp, void *arg)
}
}
- if (sdd->doall) {
- sdd->seenfrom = sdd->seento = sdd->lastsnap[0] = 0;
- if (sdd->fromsnap == NULL || missingfrom)
- sdd->seenfrom = B_TRUE;
+ sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
+ sdd->prevsnap_obj = 0;
+ if (sdd->fromsnap == NULL || missingfrom)
+ sdd->seenfrom = B_TRUE;
- rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
- if (!sdd->seenfrom) {
+ rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
+ if (!sdd->seenfrom) {
+ (void) fprintf(stderr,
+ "WARNING: could not send %s@%s:\n"
+ "incremental source (%s@%s) does not exist\n",
+ zhp->zfs_name, sdd->tosnap,
+ zhp->zfs_name, sdd->fromsnap);
+ sdd->err = B_TRUE;
+ } else if (!sdd->seento) {
+ if (sdd->fromsnap) {
(void) fprintf(stderr,
"WARNING: could not send %s@%s:\n"
- "incremental source (%s@%s) does not exist\n",
+ "incremental source (%s@%s) "
+ "is not earlier than it\n",
zhp->zfs_name, sdd->tosnap,
zhp->zfs_name, sdd->fromsnap);
- sdd->err = B_TRUE;
- } else if (!sdd->seento) {
- if (sdd->fromsnap) {
- (void) fprintf(stderr,
- "WARNING: could not send %s@%s:\n"
- "incremental source (%s@%s) "
- "is not earlier than it\n",
- zhp->zfs_name, sdd->tosnap,
- zhp->zfs_name, sdd->fromsnap);
- } else {
- (void) fprintf(stderr, "WARNING: "
- "could not send %s@%s: does not exist\n",
- zhp->zfs_name, sdd->tosnap);
- }
- sdd->err = B_TRUE;
- }
- } else {
- zfs_handle_t *snapzhp;
- char snapname[ZFS_MAXNAMELEN];
-
- (void) snprintf(snapname, sizeof (snapname), "%s@%s",
- zfs_get_name(zhp), sdd->tosnap);
- snapzhp = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT);
- if (snapzhp == NULL) {
- rv = -1;
} else {
- rv = dump_ioctl(snapzhp,
- missingfrom ? NULL : sdd->fromsnap,
- sdd->fromorigin || missingfrom,
- sdd->outfd);
- sdd->seento = B_TRUE;
- zfs_close(snapzhp);
+ (void) fprintf(stderr, "WARNING: "
+ "could not send %s@%s: does not exist\n",
+ zhp->zfs_name, sdd->tosnap);
}
+ sdd->err = B_TRUE;
}
return (rv);
@@ -644,6 +1168,29 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg)
if (!sdd->replicate)
return (dump_filesystem(rzhp, sdd));
+ /* Mark the clone origin snapshots. */
+ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+ fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+ nvlist_t *nvfs;
+ uint64_t origin_guid = 0;
+
+ VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
+ (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
+ if (origin_guid != 0) {
+ char *snapname;
+ nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+ origin_guid, &snapname);
+ if (origin_nv != NULL) {
+ nvlist_t *snapprops;
+ VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
+ "snapprops", &snapprops));
+ VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+ snapname, &snapprops));
+ VERIFY(0 == nvlist_add_boolean(
+ snapprops, "is_clone_origin"));
+ }
+ }
+ }
again:
needagain = progress = B_FALSE;
for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
@@ -653,7 +1200,6 @@ again:
zfs_handle_t *zhp;
int err;
uint64_t origin_guid = 0;
- nvlist_t *origin_nv;
VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
if (nvlist_lookup_boolean(fslist, "sent") == 0)
@@ -662,15 +1208,19 @@ again:
VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
(void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
- origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL);
- if (origin_nv &&
- nvlist_lookup_boolean(origin_nv, "sent") == ENOENT) {
- /*
- * origin has not been sent yet;
- * skip this clone.
- */
- needagain = B_TRUE;
- continue;
+ if (origin_guid != 0) {
+ nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+ origin_guid, NULL);
+ if (origin_nv != NULL &&
+ nvlist_lookup_boolean(origin_nv,
+ "sent") == ENOENT) {
+ /*
+ * origin has not been sent yet;
+ * skip this clone.
+ */
+ needagain = B_TRUE;
+ continue;
+ }
}
zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
@@ -691,20 +1241,38 @@ again:
}
/*
- * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL.
- * If 'doall', dump all intermediate snaps.
- * If 'replicate', dump special header and do recursively.
+ * Generate a send stream for the dataset identified by the argument zhp.
+ *
+ * The content of the send stream is the snapshot identified by
+ * 'tosnap'. Incremental streams are requested in two ways:
+ * - from the snapshot identified by "fromsnap" (if non-null) or
+ * - from the origin of the dataset identified by zhp, which must
+ * be a clone. In this case, "fromsnap" is null and "fromorigin"
+ * is TRUE.
+ *
+ * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
+ * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM)
+ * if "replicate" is set. If "doall" is set, dump all the intermediate
+ * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall"
+ * case too. If "props" is set, send properties.
*/
int
zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
- boolean_t replicate, boolean_t doall, boolean_t fromorigin,
- boolean_t verbose, int outfd)
+ sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+ void *cb_arg, nvlist_t **debugnvp)
{
char errbuf[1024];
send_dump_data_t sdd = { 0 };
int err;
nvlist_t *fss = NULL;
avl_tree_t *fsavl = NULL;
+ static uint64_t holdseq;
+ int spa_version;
+ boolean_t holdsnaps = B_FALSE;
+ pthread_t tid;
+ int pipefd[2];
+ dedup_arg_t dda = { 0 };
+ int featureflags = 0;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot send '%s'"), zhp->zfs_name);
@@ -715,15 +1283,46 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
}
- if (replicate || doall) {
+ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
+ uint64_t version;
+ version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ if (version >= ZPL_VERSION_SA) {
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+ }
+ }
+
+ if (zfs_spa_version(zhp, &spa_version) == 0 &&
+ spa_version >= SPA_VERSION_USERREFS &&
+ (flags.doall || flags.replicate))
+ holdsnaps = B_TRUE;
+
+ if (flags.dedup) {
+ featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
+ DMU_BACKUP_FEATURE_DEDUPPROPS);
+ if (err = pipe(pipefd)) {
+ zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+ return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
+ errbuf));
+ }
+ dda.outputfd = outfd;
+ dda.inputfd = pipefd[1];
+ dda.dedup_hdl = zhp->zfs_hdl;
+ if (err = pthread_create(&tid, NULL, cksummer, &dda)) {
+ (void) close(pipefd[0]);
+ (void) close(pipefd[1]);
+ zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+ return (zfs_error(zhp->zfs_hdl,
+ EZFS_THREADCREATEFAILED, errbuf));
+ }
+ }
+
+ if (flags.replicate || flags.doall || flags.props) {
dmu_replay_record_t drr = { 0 };
char *packbuf = NULL;
size_t buflen = 0;
zio_cksum_t zc = { 0 };
- assert(fromsnap || doall);
-
- if (replicate) {
+ if (flags.replicate || flags.props) {
nvlist_t *hdrnv;
VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
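
Earlier in this hunk, flags.dedup interposes a cksummer() filter thread between the dump ioctls and the caller's output fd, connected by a pipe (the code writes to pipefd[0] and reads from pipefd[1], which relies on pipes being bidirectional, as they are on Solaris and FreeBSD). A self-contained sketch of that producer/filter shape, with a pass-through loop standing in for cksummer():

```c
#include <pthread.h>
#include <unistd.h>

typedef struct {
	int inputfd;	/* raw stream from the producer */
	int outputfd;	/* filtered stream to the consumer */
} filter_arg_t;

/* Pass-through stand-in for cksummer(): read, transform, write. */
static void *
filter_thread(void *arg)
{
	filter_arg_t *fa = arg;
	char buf[8192];
	ssize_t n;

	while ((n = read(fa->inputfd, buf, sizeof (buf))) > 0)
		(void) write(fa->outputfd, buf, (size_t)n);
	(void) close(fa->inputfd);
	return (NULL);
}
```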
@@ -732,45 +1331,52 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
"fromsnap", fromsnap));
}
VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
+ if (!flags.replicate) {
+ VERIFY(0 == nvlist_add_boolean(hdrnv,
+ "not_recursive"));
+ }
err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
- fromsnap, tosnap, &fss, &fsavl);
+ fromsnap, tosnap, flags.replicate, &fss, &fsavl);
if (err)
- return (err);
+ goto err_out;
VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
err = nvlist_pack(hdrnv, &packbuf, &buflen,
NV_ENCODE_XDR, 0);
- nvlist_free(hdrnv);
+ if (debugnvp)
+ *debugnvp = hdrnv;
+ else
+ nvlist_free(hdrnv);
if (err) {
fsavl_destroy(fsavl);
nvlist_free(fss);
- return (zfs_standard_error(zhp->zfs_hdl,
- err, errbuf));
+ goto stderr_out;
}
}
/* write first begin record */
drr.drr_type = DRR_BEGIN;
drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr.drr_u.drr_begin.drr_version = DMU_BACKUP_HEADER_VERSION;
+ DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo,
+ DMU_COMPOUNDSTREAM);
+ DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo,
+ featureflags);
(void) snprintf(drr.drr_u.drr_begin.drr_toname,
sizeof (drr.drr_u.drr_begin.drr_toname),
"%s@%s", zhp->zfs_name, tosnap);
drr.drr_payloadlen = buflen;
- fletcher_4_incremental_native(&drr, sizeof (drr), &zc);
- err = write(outfd, &drr, sizeof (drr));
+ err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
/* write header nvlist */
- if (err != -1) {
- fletcher_4_incremental_native(packbuf, buflen, &zc);
- err = write(outfd, packbuf, buflen);
+ if (err != -1 && packbuf != NULL) {
+ err = cksum_and_write(packbuf, buflen, &zc, outfd);
}
free(packbuf);
if (err == -1) {
fsavl_destroy(fsavl);
nvlist_free(fss);
- return (zfs_standard_error(zhp->zfs_hdl,
- errno, errbuf));
+ err = errno;
+ goto stderr_out;
}
/* write end record */
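
cksum_and_write() collapses the fletcher_4_incremental_native()+write() pairs that the removed lines above used. Its assumed shape (the helper is defined earlier in this file, outside the visible hunks):

```c
/* Assumed: fold the buffer into the running checksum, then write it. */
static int
cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
{
	fletcher_4_incremental_native(buf, len, zc);
	return (write(outfd, buf, len));
}
```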
@@ -782,8 +1388,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
if (err == -1) {
fsavl_destroy(fsavl);
nvlist_free(fss);
- return (zfs_standard_error(zhp->zfs_hdl,
- errno, errbuf));
+ err = errno;
+ goto stderr_out;
}
}
}
@@ -791,18 +1397,47 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
/* dump each stream */
sdd.fromsnap = fromsnap;
sdd.tosnap = tosnap;
- sdd.outfd = outfd;
- sdd.replicate = replicate;
- sdd.doall = doall;
- sdd.fromorigin = fromorigin;
+ if (flags.dedup)
+ sdd.outfd = pipefd[0];
+ else
+ sdd.outfd = outfd;
+ sdd.replicate = flags.replicate;
+ sdd.doall = flags.doall;
+ sdd.fromorigin = flags.fromorigin;
sdd.fss = fss;
sdd.fsavl = fsavl;
- sdd.verbose = verbose;
+ sdd.verbose = flags.verbose;
+ sdd.filter_cb = filter_func;
+ sdd.filter_cb_arg = cb_arg;
+ if (debugnvp)
+ sdd.debugnv = *debugnvp;
+ if (holdsnaps) {
+ ++holdseq;
+ (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
+ ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
+ sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+ if (sdd.cleanup_fd < 0) {
+ err = errno;
+ goto stderr_out;
+ }
+ } else {
+ sdd.cleanup_fd = -1;
+ }
err = dump_filesystems(zhp, &sdd);
fsavl_destroy(fsavl);
nvlist_free(fss);
- if (replicate || doall) {
+ if (flags.dedup) {
+ (void) close(pipefd[0]);
+ (void) pthread_join(tid, NULL);
+ }
+
+ if (sdd.cleanup_fd != -1) {
+ VERIFY(0 == close(sdd.cleanup_fd));
+ sdd.cleanup_fd = -1;
+ }
+
+ if (flags.replicate || flags.doall || flags.props) {
/*
* write final end record. NB: want to do this even if
* there was some error, because it might not be totally
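
A note on the holdsnaps path in this hunk: on pools at SPA_VERSION_USERREFS or later, the send places temporary user holds tagged ".send-<pid>-<seq>" on each snapshot, registered against cleanup_fd so the kernel releases them when the descriptor closes, even if the process dies mid-send. The lifecycle, sketched (the hold-taking inside dump_ioctl() is assumed, not shown in this diff):

```c
static uint64_t holdseq;
char holdtag[ZFS_MAXNAMELEN];
int cleanup_fd;

cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
(void) snprintf(holdtag, sizeof (holdtag), ".send-%d-%llu",
    getpid(), (u_longlong_t)++holdseq);
/* ... dump_ioctl() takes holds tagged 'holdtag' via cleanup_fd ... */
VERIFY(0 == close(cleanup_fd));	/* kernel drops any remaining holds */
```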
@@ -817,6 +1452,18 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
}
return (err || sdd.err);
+
+stderr_out:
+ err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
+err_out:
+ if (sdd.cleanup_fd != -1)
+ VERIFY(0 == close(sdd.cleanup_fd));
+ if (flags.dedup) {
+ (void) pthread_cancel(tid);
+ (void) pthread_join(tid, NULL);
+ (void) close(pipefd[0]);
+ }
+ return (err);
}
/*
@@ -902,11 +1549,12 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
if (err)
return (err);
+ zc.zc_objset_type = DMU_OST_ZFS;
+ (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
if (tryname) {
(void) strcpy(newname, tryname);
- zc.zc_objset_type = DMU_OST_ZFS;
- (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
if (flags.verbose) {
@@ -961,12 +1609,18 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
int err = 0;
prop_changelist_t *clp;
zfs_handle_t *zhp;
+ boolean_t defer = B_FALSE;
+ int spa_version;
zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
if (zhp == NULL)
return (-1);
clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
flags.force ? MS_FORCE : 0);
+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
+ zfs_spa_version(zhp, &spa_version) == 0 &&
+ spa_version >= SPA_VERSION_USERREFS)
+ defer = B_TRUE;
zfs_close(zhp);
if (clp == NULL)
return (-1);
@@ -975,12 +1629,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
return (err);
zc.zc_objset_type = DMU_OST_ZFS;
+ zc.zc_defer_destroy = defer;
(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
if (flags.verbose)
(void) printf("attempting destroy %s\n", zc.zc_name);
err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
-
if (err == 0) {
if (flags.verbose)
(void) printf("success\n");
@@ -990,8 +1644,14 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
(void) changelist_postfix(clp);
changelist_free(clp);
- if (err != 0)
+ /*
+ * Deferred destroy might destroy the snapshot or only mark it to be
+ * destroyed later, and it returns success in either case.
+ */
+ if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
+ ZFS_TYPE_SNAPSHOT))) {
err = recv_rename(hdl, name, NULL, baselen, newname, flags);
+ }
return (err);
}
@@ -1009,6 +1669,7 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg)
if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
(void) strcpy(gtnd->name, zhp->zfs_name);
+ zfs_close(zhp);
return (EEXIST);
}
err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
@@ -1099,19 +1760,22 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
static int
recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
- recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl)
+ recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl,
+ nvlist_t *renamed)
{
nvlist_t *local_nv;
avl_tree_t *local_avl;
nvpair_t *fselem, *nextfselem;
- char *tosnap, *fromsnap;
+ char *fromsnap;
char newname[ZFS_MAXNAMELEN];
int error;
- boolean_t needagain, progress;
+ boolean_t needagain, progress, recursive;
char *s1, *s2;
VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
- VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap));
+
+ recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+ ENOENT);
if (flags.dryrun)
return (0);
@@ -1120,7 +1784,7 @@ again:
needagain = progress = B_FALSE;
if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
- &local_nv, &local_avl)) != 0)
+ recursive, &local_nv, &local_avl)) != 0)
return (error);
/*
@@ -1135,7 +1799,7 @@ again:
uint64_t originguid = 0;
uint64_t stream_originguid = 0;
uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid;
- char *fsname, *stream_fsname, *p1, *p2;
+ char *fsname, *stream_fsname;
nextfselem = nvlist_next_nvpair(local_nv, fselem);
@@ -1243,7 +1907,7 @@ again:
stream_snapname, &props)) {
zfs_cmd_t zc = { 0 };
- zc.zc_cookie = B_TRUE; /* clear current props */
+ zc.zc_cookie = B_TRUE; /* received */
(void) snprintf(zc.zc_name, sizeof (zc.zc_name),
"%s@%s", fsname, nvpair_name(snapelem));
if (zcmd_write_src_nvlist(hdl, &zc,
@@ -1291,10 +1955,13 @@ again:
continue;
}
- if (fromguid == 0 && flags.verbose) {
- (void) printf("local fs %s does not have fromsnap "
- "(%s in stream); must have been deleted locally; "
- "ignoring\n", fsname, fromsnap);
+ if (fromguid == 0) {
+ if (flags.verbose) {
+ (void) printf("local fs %s does not have "
+ "fromsnap (%s in stream); must have "
+ "been deleted locally; ignoring\n",
+ fsname, fromsnap);
+ }
continue;
}
@@ -1306,10 +1973,16 @@ again:
s1 = strrchr(fsname, '/');
s2 = strrchr(stream_fsname, '/');
- /* check for rename */
+ /*
+ * Check for rename. If the exact receive path is specified, it
+ * does not count as a rename, but we still need to check the
+ * datasets beneath it.
+ */
if ((stream_parent_fromsnap_guid != 0 &&
+ parent_fromsnap_guid != 0 &&
stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
- ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
+ ((flags.isprefix || strcmp(tofs, fsname) != 0) &&
+ (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
nvlist_t *parent;
char tryname[ZFS_MAXNAMELEN];
@@ -1328,7 +2001,7 @@ again:
VERIFY(0 == nvlist_lookup_string(parent, "name",
&pname));
(void) snprintf(tryname, sizeof (tryname),
- "%s%s", pname, p2 != NULL ? p2 : "");
+ "%s%s", pname, strrchr(stream_fsname, '/'));
} else {
tryname[0] = '\0';
if (flags.verbose) {
@@ -1337,8 +2010,16 @@ again:
}
}
+ newname[0] = '\0';
+
error = recv_rename(hdl, fsname, tryname,
strlen(tofs)+1, newname, flags);
+
+ if (renamed != NULL && newname[0] != '\0') {
+ VERIFY(0 == nvlist_add_boolean(renamed,
+ newname));
+ }
+
if (error)
needagain = B_TRUE;
else
@@ -1362,42 +2043,33 @@ again:
static int
zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
- char **top_zfs)
+ char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
{
nvlist_t *stream_nv = NULL;
avl_tree_t *stream_avl = NULL;
char *fromsnap = NULL;
+ char *cp;
char tofs[ZFS_MAXNAMELEN];
+ char sendfs[ZFS_MAXNAMELEN];
char errbuf[1024];
dmu_replay_record_t drre;
int error;
boolean_t anyerr = B_FALSE;
boolean_t softerr = B_FALSE;
+ boolean_t recursive;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot receive"));
- if (strchr(destname, '@')) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "can not specify snapshot name for multi-snapshot stream"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
-
assert(drr->drr_type == DRR_BEGIN);
assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
- assert(drr->drr_u.drr_begin.drr_version == DMU_BACKUP_HEADER_VERSION);
+ assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM);
/*
* Read in the nvlist from the stream.
*/
if (drr->drr_payloadlen != 0) {
- if (!flags.isprefix) {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "must use -d to receive replication "
- "(send -R) stream"));
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
- }
-
error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
&stream_nv, flags.byteswap, zc);
if (error) {
@@ -1406,6 +2078,16 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
}
}
+ recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+ ENOENT);
+
+ if (recursive && strchr(destname, '@')) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "cannot specify snapshot name for multi-snapshot stream"));
+ error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
+ goto out;
+ }
+
/*
* Read in the end record and verify checksum.
*/
@@ -1449,21 +2131,73 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
}
if (fromsnap != NULL) {
+ nvlist_t *renamed = NULL;
+ nvpair_t *pair = NULL;
+
(void) strlcpy(tofs, destname, ZFS_MAXNAMELEN);
if (flags.isprefix) {
- int i = strcspn(drr->drr_u.drr_begin.drr_toname,
- "/@");
+ struct drr_begin *drrb = &drr->drr_u.drr_begin;
+ int i;
+
+ if (flags.istail) {
+ cp = strrchr(drrb->drr_toname, '/');
+ if (cp == NULL) {
+ (void) strlcat(tofs, "/",
+ ZFS_MAXNAMELEN);
+ i = 0;
+ } else {
+ i = (cp - drrb->drr_toname);
+ }
+ } else {
+ i = strcspn(drrb->drr_toname, "/@");
+ }
/* zfs_receive_one() will create_parents() */
- (void) strlcat(tofs,
- &drr->drr_u.drr_begin.drr_toname[i],
+ (void) strlcat(tofs, &drrb->drr_toname[i],
ZFS_MAXNAMELEN);
*strchr(tofs, '@') = '\0';
}
- softerr = recv_incremental_replication(hdl, tofs,
- flags, stream_nv, stream_avl);
+
+ if (recursive && !flags.dryrun && !flags.nomount) {
+ VERIFY(0 == nvlist_alloc(&renamed,
+ NV_UNIQUE_NAME, 0));
+ }
+
+ softerr = recv_incremental_replication(hdl, tofs, flags,
+ stream_nv, stream_avl, renamed);
+
+ /* Unmount renamed filesystems before receiving. */
+ while ((pair = nvlist_next_nvpair(renamed,
+ pair)) != NULL) {
+ zfs_handle_t *zhp;
+ prop_changelist_t *clp = NULL;
+
+ zhp = zfs_open(hdl, nvpair_name(pair),
+ ZFS_TYPE_FILESYSTEM);
+ if (zhp != NULL) {
+ clp = changelist_gather(zhp,
+ ZFS_PROP_MOUNTPOINT, 0, 0);
+ zfs_close(zhp);
+ if (clp != NULL) {
+ softerr |=
+ changelist_prefix(clp);
+ changelist_free(clp);
+ }
+ }
+ }
+
+ nvlist_free(renamed);
}
}
+ /*
+ * Get the fs specified by the first path in the stream (the top level
+ * specified by 'zfs send') and pass it to each invocation of
+ * zfs_receive_one().
+ */
+ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname,
+ ZFS_MAXNAMELEN);
+ if ((cp = strchr(sendfs, '@')) != NULL)
+ *cp = '\0';
/* Finally, receive each contained stream */
do {
@@ -1475,7 +2209,8 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
* recv_skip() and return 0).
*/
error = zfs_receive_impl(hdl, destname, flags, fd,
- stream_avl, top_zfs);
+ sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
+ action_handlep);
if (error == ENODATA) {
error = 0;
break;
@@ -1489,7 +2224,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
* renames again.
*/
softerr = recv_incremental_replication(hdl, tofs, flags,
- stream_nv, stream_avl);
+ stream_nv, stream_avl, NULL);
}
out:
@@ -1503,11 +2238,28 @@ out:
return (error);
}
+static void
+trunc_prop_errs(int truncated)
+{
+ ASSERT(truncated != 0);
+
+ if (truncated == 1)
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+ "1 more property could not be set\n"));
+ else
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+ "%d more properties could not be set\n"), truncated);
+}
+
static int
recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
{
dmu_replay_record_t *drr;
void *buf = malloc(1<<20);
+ char errbuf[1024];
+
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot receive:"));
/* XXX would be great to use lseek if possible... */
drr = buf;
@@ -1520,7 +2272,11 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
switch (drr->drr_type) {
case DRR_BEGIN:
/* NB: not to be used on v2 stream packages */
- assert(drr->drr_payloadlen == 0);
+ if (drr->drr_payloadlen != 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid substream header"));
+ return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+ }
break;
case DRR_END:
@@ -1546,13 +2302,23 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
(void) recv_read(hdl, fd, buf,
drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
break;
-
+ case DRR_SPILL:
+ if (byteswap) {
+ drr->drr_u.drr_spill.drr_length =
+ BSWAP_64(drr->drr_u.drr_spill.drr_length);
+ }
+ (void) recv_read(hdl, fd, buf,
+ drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
+ break;
+ case DRR_WRITE_BYREF:
case DRR_FREEOBJECTS:
case DRR_FREE:
break;
default:
- assert(!"invalid record type");
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid record type"));
+ return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
}
}
@@ -1566,27 +2332,34 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
static int
zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
recvflags_t flags, dmu_replay_record_t *drr,
- dmu_replay_record_t *drr_noswap, avl_tree_t *stream_avl,
- char **top_zfs)
+ dmu_replay_record_t *drr_noswap, const char *sendfs,
+ nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+ uint64_t *action_handlep)
{
zfs_cmd_t zc = { 0 };
time_t begin_time;
- int ioctl_err, ioctl_errno, err, choplen;
+ int ioctl_err, ioctl_errno, err;
char *cp;
struct drr_begin *drrb = &drr->drr_u.drr_begin;
char errbuf[1024];
- char chopprefix[ZFS_MAXNAMELEN];
+ char prop_errbuf[1024];
+ const char *chopprefix;
boolean_t newfs = B_FALSE;
boolean_t stream_wantsnewfs;
uint64_t parent_snapguid = 0;
prop_changelist_t *clp = NULL;
nvlist_t *snapprops_nvlist = NULL;
+ zprop_errflags_t prop_errflags;
+ boolean_t recursive;
begin_time = time(NULL);
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot receive"));
+ recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+ ENOENT);
+
if (stream_avl != NULL) {
char *snapname;
nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
@@ -1617,6 +2390,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
return (-1);
}
+ cp = NULL;
+
/*
* Determine how much of the snapshot name stored in the stream
* we are going to tack on to the name they specified on the
@@ -1625,38 +2400,77 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
* If they specified a snapshot, chop the entire name stored in
* the stream.
*/
- (void) strcpy(chopprefix, drrb->drr_toname);
- if (flags.isprefix) {
+ if (flags.istail) {
+ /*
+ * A filesystem was specified with -e. We want to tack on only
+ * the tail of the sent snapshot path.
+ */
+ if (strchr(tosnap, '@')) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
+ "argument - snapshot not allowed with -e"));
+ return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
+ }
+
+ chopprefix = strrchr(sendfs, '/');
+
+ if (chopprefix == NULL) {
+ /*
+ * The tail is the poolname, so we need to
+ * prepend a path separator.
+ */
+ int len = strlen(drrb->drr_toname);
+ cp = malloc(len + 2);
+ cp[0] = '/';
+ (void) strcpy(&cp[1], drrb->drr_toname);
+ chopprefix = cp;
+ } else {
+ chopprefix = drrb->drr_toname + (chopprefix - sendfs);
+ }
+ } else if (flags.isprefix) {
/*
- * They specified a fs with -d, we want to tack on
- * everything but the pool name stored in the stream
+ * A filesystem was specified with -d. We want to tack on
+ * everything but the first element of the sent snapshot path
+ * (all but the pool name).
*/
if (strchr(tosnap, '@')) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
"argument - snapshot not allowed with -d"));
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
}
- cp = strchr(chopprefix, '/');
- if (cp == NULL)
- cp = strchr(chopprefix, '@');
- *cp = '\0';
+
+ chopprefix = strchr(drrb->drr_toname, '/');
+ if (chopprefix == NULL)
+ chopprefix = strchr(drrb->drr_toname, '@');
} else if (strchr(tosnap, '@') == NULL) {
/*
- * If they specified a filesystem without -d, we want to
- * tack on everything after the fs specified in the
- * first name from the stream.
+ * If a filesystem was specified without -d or -e, we want to
+ * tack on everything after the fs specified by 'zfs send'.
*/
- cp = strchr(chopprefix, '@');
- *cp = '\0';
+ chopprefix = drrb->drr_toname + strlen(sendfs);
+ } else {
+ /* A snapshot was specified as an exact path (no -d or -e). */
+ if (recursive) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "cannot specify snapshot name for multi-snapshot "
+ "stream"));
+ return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+ }
+ chopprefix = drrb->drr_toname + strlen(drrb->drr_toname);
}
- choplen = strlen(chopprefix);
+
+ ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname);
+ ASSERT(chopprefix > drrb->drr_toname);
+ ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname));
+ ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' ||
+ chopprefix[0] == '\0');
/*
* Determine name of destination snapshot, store in zc_value.
*/
+ (void) strcpy(zc.zc_top_ds, tosnap);
(void) strcpy(zc.zc_value, tosnap);
- (void) strncat(zc.zc_value, drrb->drr_toname+choplen,
- sizeof (zc.zc_value));
+ (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
+ free(cp);
if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
zcmd_free_nvlists(&zc);
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
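
A worked example of the four chop cases above, with hypothetical names: stream drr_toname "tank/home/user@snap", sendfs "tank/home", destination "pool":

```c
/*
 * -e (istail):    chopprefix = "/user@snap"       -> pool/user@snap
 * -d (isprefix):  chopprefix = "/home/user@snap"  -> pool/home/user@snap
 * no flag, fs:    chopprefix = "/user@snap"       -> pool/user@snap
 * no flag, snap:  chopprefix = ""                 -> destination used as-is
 */
```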
@@ -1714,7 +2528,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
(void) strcpy(zc.zc_name, zc.zc_value);
*strchr(zc.zc_name, '@') = '\0';
- if (!zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
+ /*
+ * If the exact receive path was specified and this is the
+ * topmost path in the stream, and the fs does not exist, we
+ * should look no further.
+ */
+ if ((flags.isprefix || (*(chopprefix = drrb->drr_toname +
+ strlen(sendfs)) != '\0' && *chopprefix != '@')) &&
+ !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
char snap[ZFS_MAXNAMELEN];
(void) strcpy(snap, strchr(zc.zc_value, '@'));
if (guid_to_name(hdl, tosnap, drrb->drr_fromguid,
@@ -1730,6 +2551,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
zfs_handle_t *zhp;
+
/*
* Destination fs exists. Therefore this should either
* be an incremental, or the stream specifies a new fs
@@ -1737,7 +2559,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
* away (and have therefore specified -F and removed any
* snapshots).
*/
-
if (stream_wantsnewfs) {
if (!flags.force) {
zcmd_free_nvlists(&zc);
@@ -1780,21 +2601,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
/* We can't do online recv in this case */
clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
if (clp == NULL) {
+ zfs_close(zhp);
zcmd_free_nvlists(&zc);
return (-1);
}
if (changelist_prefix(clp) != 0) {
changelist_free(clp);
+ zfs_close(zhp);
zcmd_free_nvlists(&zc);
return (-1);
}
}
- if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME &&
- zvol_remove_link(hdl, zhp->zfs_name) != 0) {
- zfs_close(zhp);
- zcmd_free_nvlists(&zc);
- return (-1);
- }
zfs_close(zhp);
} else {
/*
@@ -1818,7 +2635,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
*/
*cp = '\0';
- if (flags.isprefix && !flags.dryrun &&
+ if (flags.isprefix && !flags.istail && !flags.dryrun &&
create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
zcmd_free_nvlists(&zc);
return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
@@ -1843,21 +2660,61 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
return (recv_skip(hdl, infd, flags.byteswap));
}
+ zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
+ zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
+ zc.zc_cleanup_fd = cleanup_fd;
+ zc.zc_action_handle = *action_handlep;
+
err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
ioctl_errno = errno;
+ prop_errflags = (zprop_errflags_t)zc.zc_obj;
+
+ if (err == 0) {
+ nvlist_t *prop_errors;
+ VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
+ zc.zc_nvlist_dst_size, &prop_errors, 0));
+
+ nvpair_t *prop_err = NULL;
+
+ while ((prop_err = nvlist_next_nvpair(prop_errors,
+ prop_err)) != NULL) {
+ char tbuf[1024];
+ zfs_prop_t prop;
+ int intval;
+
+ prop = zfs_name_to_prop(nvpair_name(prop_err));
+ (void) nvpair_value_int32(prop_err, &intval);
+ if (strcmp(nvpair_name(prop_err),
+ ZPROP_N_MORE_ERRORS) == 0) {
+ trunc_prop_errs(intval);
+ break;
+ } else {
+ (void) snprintf(tbuf, sizeof (tbuf),
+ dgettext(TEXT_DOMAIN,
+ "cannot receive %s property on %s"),
+ nvpair_name(prop_err), zc.zc_name);
+ zfs_setprop_error(hdl, prop, intval, tbuf);
+ }
+ }
+ nvlist_free(prop_errors);
+ }
+
+ zc.zc_nvlist_dst = 0;
+ zc.zc_nvlist_dst_size = 0;
zcmd_free_nvlists(&zc);
if (err == 0 && snapprops_nvlist) {
zfs_cmd_t zc2 = { 0 };
(void) strcpy(zc2.zc_name, zc.zc_value);
+ zc2.zc_cookie = B_TRUE; /* received */
if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) {
(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2);
zcmd_free_nvlists(&zc2);
}
}
- if (err && (ioctl_errno == ENOENT || ioctl_errno == ENODEV)) {
+ if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) {
/*
* It may be that this snapshot already exists,
* in which case we want to consume & ignore it
@@ -1865,7 +2722,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
*/
avl_tree_t *local_avl;
nvlist_t *local_nv, *fs;
- char *cp = strchr(zc.zc_value, '@');
+ cp = strchr(zc.zc_value, '@');
/*
* XXX Do this faster by just iterating over snaps in
@@ -1873,7 +2730,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
* get a strange "does not exist" error message.
*/
*cp = '\0';
- if (gather_nvlist(hdl, zc.zc_value, NULL, NULL,
+ if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE,
&local_nv, &local_avl) == 0) {
*cp = '@';
fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
@@ -1885,14 +2742,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
(void) printf("snap %s already exists; "
"ignoring\n", zc.zc_value);
}
- ioctl_err = recv_skip(hdl, infd,
+ err = ioctl_err = recv_skip(hdl, infd,
flags.byteswap);
}
}
*cp = '@';
}
-
if (ioctl_err != 0) {
switch (ioctl_errno) {
case ENODEV:
@@ -1931,17 +2787,25 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
"invalid stream (checksum mismatch)"));
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded to receive this stream."));
+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+ break;
+ case EDQUOT:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "destination %s space quota exceeded"), zc.zc_name);
+ (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+ break;
default:
(void) zfs_standard_error(hdl, ioctl_errno, errbuf);
}
}
/*
- * Mount or recreate the /dev links for the target filesystem
- * (if created, or if we tore them down to do an incremental
- * restore), and the /dev links for the new snapshot (if
- * created). Also mount any children of the target filesystem
- * if we did an incremental receive.
+ * Mount the target filesystem (if created). Also mount any
+ * children of the target filesystem if we did a replication
+ * receive (indicated by stream_avl being non-NULL).
*/
cp = strchr(zc.zc_value, '@');
if (cp && (ioctl_err == 0 || !newfs)) {
@@ -1953,11 +2817,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
if (h != NULL) {
if (h->zfs_type == ZFS_TYPE_VOLUME) {
*cp = '@';
- err = zvol_create_link(hdl, h->zfs_name);
- if (err == 0 && ioctl_err == 0)
- err = zvol_create_link(hdl,
- zc.zc_value);
- } else if (newfs) {
+ } else if (newfs || stream_avl) {
/*
* Track the first/top of hierarchy fs,
* for mounting and sharing later.
@@ -1975,9 +2835,24 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
changelist_free(clp);
}
+ if (prop_errflags & ZPROP_ERR_NOCLEAR) {
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
+ "failed to clear unreceived properties on %s"),
+ zc.zc_name);
+ (void) fprintf(stderr, "\n");
+ }
+ if (prop_errflags & ZPROP_ERR_NORESTORE) {
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
+ "failed to restore original properties on %s"),
+ zc.zc_name);
+ (void) fprintf(stderr, "\n");
+ }
+
if (err || ioctl_err)
return (-1);
+ *action_handlep = zc.zc_action_handle;
+
if (flags.verbose) {
char buf1[64];
char buf2[64];
@@ -1997,13 +2872,16 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
static int
zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
- int infd, avl_tree_t *stream_avl, char **top_zfs)
+ int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl,
+ char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
{
int err;
dmu_replay_record_t drr, drr_noswap;
struct drr_begin *drrb = &drr.drr_u.drr_begin;
char errbuf[1024];
zio_cksum_t zcksum = { 0 };
+ uint64_t featureflags;
+ int hdrtype;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot receive"));
@@ -2041,7 +2919,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
drr.drr_type = BSWAP_32(drr.drr_type);
drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
drrb->drr_type = BSWAP_32(drrb->drr_type);
drrb->drr_flags = BSWAP_32(drrb->drr_flags);
@@ -2055,23 +2933,45 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
}
+ featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);
+
+ if (!DMU_STREAM_SUPPORTED(featureflags) ||
+ (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "stream has unsupported feature, feature flags = %lx"),
+ featureflags);
+ return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+ }
+
if (strchr(drrb->drr_toname, '@') == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
"stream (bad snapshot name)"));
return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
}
- if (drrb->drr_version == DMU_BACKUP_STREAM_VERSION) {
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
+ char nonpackage_sendfs[ZFS_MAXNAMELEN];
+ if (sendfs == NULL) {
+ /*
+ * We were not called from zfs_receive_package(). Get
+ * the fs specified by 'zfs send'.
+ */
+ char *cp;
+ (void) strlcpy(nonpackage_sendfs,
+ drr.drr_u.drr_begin.drr_toname, ZFS_MAXNAMELEN);
+ if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
+ *cp = '\0';
+ sendfs = nonpackage_sendfs;
+ }
return (zfs_receive_one(hdl, infd, tosnap, flags,
- &drr, &drr_noswap, stream_avl, top_zfs));
- } else if (drrb->drr_version == DMU_BACKUP_HEADER_VERSION) {
- return (zfs_receive_package(hdl, infd, tosnap, flags,
- &drr, &zcksum, top_zfs));
+ &drr, &drr_noswap, sendfs, stream_nv, stream_avl,
+ top_zfs, cleanup_fd, action_handlep));
} else {
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "stream is unsupported version %llu"),
- drrb->drr_version);
- return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+ assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM);
+ return (zfs_receive_package(hdl, infd, tosnap, flags,
+ &drr, &zcksum, top_zfs, cleanup_fd, action_handlep));
}
}
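
drr_versioninfo multiplexes the header type and feature flags tested above in one 64-bit field. A sketch of the packing, assuming the bit layout from this commit's zfs_ioctl.h (a 2-bit header type at bit 0 with the feature-flag field above it):

```c
uint64_t vi = 0;

DMU_SET_STREAM_HDRTYPE(vi, DMU_COMPOUNDSTREAM);
DMU_SET_FEATUREFLAGS(vi, DMU_BACKUP_FEATURE_SA_SPILL);

assert(DMU_GET_STREAM_HDRTYPE(vi) == DMU_COMPOUNDSTREAM);
assert(DMU_STREAM_SUPPORTED(DMU_GET_FEATUREFLAGS(vi)));
```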
@@ -2087,8 +2987,16 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
{
char *top_zfs = NULL;
int err;
+ int cleanup_fd;
+ uint64_t action_handle = 0;
+
+ cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+ VERIFY(cleanup_fd >= 0);
+
+ err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL,
+ stream_avl, &top_zfs, cleanup_fd, &action_handle);
- err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs);
+ VERIFY(0 == close(cleanup_fd));
if (err == 0 && !flags.nomount && top_zfs) {
zfs_handle_t *zhp;
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
index c7eb04e74cac..24725ec044ec 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -104,6 +103,13 @@ vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
return (state == VDEV_STATE_OFFLINE);
}
+/* ARGSUSED */
+static int
+vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+{
+ return (state == VDEV_STATE_REMOVED);
+}
+
/*
* Detect whether any leaf devices have seen errors or could not be opened.
*/
@@ -131,7 +137,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
if (find_vdev_problem(child[c], func))
return (B_TRUE);
} else {
- verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
if (func(vs->vs_state, vs->vs_aux,
@@ -166,7 +172,8 @@ check_status(nvlist_t *config, boolean_t isimport)
{
nvlist_t *nvroot;
vdev_stat_t *vs;
- uint_t vsc;
+ pool_scan_stat_t *ps = NULL;
+ uint_t vsc, psc;
uint64_t nerr;
uint64_t version;
uint64_t stateval;
@@ -177,15 +184,24 @@ check_status(nvlist_t *config, boolean_t isimport)
&version) == 0);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&stateval) == 0);
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+ /*
+ * Currently resilvering a vdev
+ */
+ (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &psc);
+ if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+ ps->pss_state == DSS_SCANNING)
+ return (ZPOOL_STATUS_RESILVERING);
/*
* Pool last accessed by another system.
*/
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
if (hostid != 0 && (unsigned long)hostid != gethostid() &&
stateval == POOL_STATE_ACTIVE)
return (ZPOOL_STATUS_HOSTID_MISMATCH);
@@ -276,10 +292,10 @@ check_status(nvlist_t *config, boolean_t isimport)
return (ZPOOL_STATUS_OFFLINE_DEV);
/*
- * Currently resilvering
+ * Removed device
*/
- if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
- return (ZPOOL_STATUS_RESILVERING);
+ if (find_vdev_problem(nvroot, vdev_removed))
+ return (ZPOOL_STATUS_REMOVED_DEV);
/*
* Outdated, but usable, version
@@ -315,3 +331,68 @@ zpool_import_status(nvlist_t *config, char **msgid)
return (ret);
}
+
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+ char refcnt[6];
+ char blocks[6], lsize[6], psize[6], dsize[6];
+ char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
+
+ if (dds == NULL || dds->dds_blocks == 0)
+ return;
+
+ if (h == -1)
+ (void) strcpy(refcnt, "Total");
+ else
+ zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));
+
+ zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
+ zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize));
+ zfs_nicenum(dds->dds_psize, psize, sizeof (psize));
+ zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize));
+ zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
+ zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
+ zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
+ zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ refcnt,
+ blocks, lsize, psize, dsize,
+ ref_blocks, ref_lsize, ref_psize, ref_dsize);
+}
+
+/*
+ * Print the DDT histogram and the column totals.
+ */
+void
+zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
+{
+ int h;
+
+ (void) printf("\n");
+
+ (void) printf("bucket "
+ " allocated "
+ " referenced \n");
+ (void) printf("______ "
+ "______________________________ "
+ "______________________________\n");
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ "refcnt",
+ "blocks", "LSIZE", "PSIZE", "DSIZE",
+ "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ "------",
+ "------", "-----", "-----", "-----",
+ "------", "-----", "-----", "-----");
+
+ for (h = 0; h < 64; h++)
+ dump_ddt_stat(&ddh->ddh_stat[h], h);
+
+ dump_ddt_stat(dds_total, -1);
+
+ (void) printf("\n");
+}
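
For reference, the shape of the table zpool_dump_ddt() emits (all values hypothetical):

```c
/*
 * bucket             allocated                       referenced
 * ______   ______________________________   ______________________________
 * refcnt   blocks   LSIZE   PSIZE   DSIZE   blocks   LSIZE   PSIZE   DSIZE
 * ------   ------   -----   -----   -----   ------   -----   -----   -----
 *      1    1.01M    126G    126G    126G    1.01M    126G    126G    126G
 *      2     118K   14.0G   14.0G   14.0G     263K   31.1G   31.1G   31.1G
 *  Total    1.13M    140G    140G    140G    1.27M    157G    157G    157G
 */
```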
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
index ddd83742e950..01738fbff2a0 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
@@ -19,14 +19,18 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Internal utility routines for the ZFS library.
*/
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/stat.h>
+
#include <errno.h>
#include <fcntl.h>
#include <libintl.h>
@@ -46,6 +50,8 @@
#include "libzfs_impl.h"
#include "zfs_prop.h"
+int aok;
+
int
libzfs_errno(libzfs_handle_t *hdl)
{
@@ -70,7 +76,7 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_BADPROP:
return (dgettext(TEXT_DOMAIN, "invalid property value"));
case EZFS_PROPREADONLY:
- return (dgettext(TEXT_DOMAIN, "read only property"));
+ return (dgettext(TEXT_DOMAIN, "read-only property"));
case EZFS_PROPTYPE:
return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
"datasets of this type"));
@@ -90,12 +96,10 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_BADSTREAM:
return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
case EZFS_DSREADONLY:
- return (dgettext(TEXT_DOMAIN, "dataset is read only"));
+ return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
case EZFS_VOLTOOBIG:
return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
"this system"));
- case EZFS_VOLHASDATA:
- return (dgettext(TEXT_DOMAIN, "volume has data"));
case EZFS_INVALIDNAME:
return (dgettext(TEXT_DOMAIN, "invalid name"));
case EZFS_BADRESTORE:
@@ -138,16 +142,12 @@ libzfs_error_description(libzfs_handle_t *hdl)
return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
case EZFS_SHARESMBFAILED:
return (dgettext(TEXT_DOMAIN, "smb add share failed"));
- case EZFS_ISCSISVCUNAVAIL:
- return (dgettext(TEXT_DOMAIN,
- "iscsitgt service need to be enabled by "
- "a privileged user"));
- case EZFS_DEVLINKS:
- return (dgettext(TEXT_DOMAIN, "failed to create /dev links"));
case EZFS_PERM:
return (dgettext(TEXT_DOMAIN, "permission denied"));
case EZFS_NOSPC:
return (dgettext(TEXT_DOMAIN, "out of space"));
+ case EZFS_FAULT:
+ return (dgettext(TEXT_DOMAIN, "bad address"));
case EZFS_IO:
return (dgettext(TEXT_DOMAIN, "I/O error"));
case EZFS_INTR:
@@ -161,12 +161,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
case EZFS_NOHISTORY:
return (dgettext(TEXT_DOMAIN, "no history available"));
- case EZFS_UNSHAREISCSIFAILED:
- return (dgettext(TEXT_DOMAIN,
- "iscsitgtd failed request to unshare"));
- case EZFS_SHAREISCSIFAILED:
- return (dgettext(TEXT_DOMAIN,
- "iscsitgtd failed request to share"));
case EZFS_POOLPROPS:
return (dgettext(TEXT_DOMAIN, "failed to retrieve "
"pool properties"));
@@ -194,9 +188,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_NODELEGATION:
return (dgettext(TEXT_DOMAIN, "delegated administration is "
"disabled on pool"));
- case EZFS_PERMRDONLY:
- return (dgettext(TEXT_DOMAIN, "snapshot permissions cannot be"
- " modified"));
case EZFS_BADCACHE:
return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
case EZFS_ISL2CACHE:
@@ -213,6 +204,31 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_UNPLAYED_LOGS:
return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
"logs"));
+ case EZFS_REFTAG_RELE:
+ return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
+ case EZFS_REFTAG_HOLD:
+ return (dgettext(TEXT_DOMAIN, "tag already exists on this "
+ "dataset"));
+ case EZFS_TAGTOOLONG:
+ return (dgettext(TEXT_DOMAIN, "tag too long"));
+ case EZFS_PIPEFAILED:
+ return (dgettext(TEXT_DOMAIN, "pipe create failed"));
+ case EZFS_THREADCREATEFAILED:
+ return (dgettext(TEXT_DOMAIN, "thread create failed"));
+ case EZFS_POSTSPLIT_ONLINE:
+ return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
+ "into a new one"));
+ case EZFS_SCRUBBING:
+ return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
+ "use 'zpool scrub -s' to cancel current scrub"));
+ case EZFS_NO_SCRUB:
+ return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
+ case EZFS_DIFF:
+ return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
+ case EZFS_DIFFDATA:
+ return (dgettext(TEXT_DOMAIN, "invalid diff data"));
+ case EZFS_POOLREADONLY:
+ return (dgettext(TEXT_DOMAIN, "pool is read-only"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@@ -301,6 +317,10 @@ zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
zfs_verror(hdl, EZFS_IO, fmt, ap);
return (-1);
+ case EFAULT:
+ zfs_verror(hdl, EZFS_FAULT, fmt, ap);
+ return (-1);
+
case EINTR:
zfs_verror(hdl, EZFS_INTR, fmt, ap);
return (-1);
@@ -357,9 +377,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
break;
case EROFS:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "snapshot permissions cannot be modified"));
- zfs_verror(hdl, EZFS_PERMRDONLY, fmt, ap);
+ zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
break;
case ENAMETOOLONG:
zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
@@ -373,7 +391,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
default:
- zfs_error_aux(hdl, strerror(errno));
+ zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
break;
}
@@ -445,12 +463,17 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case EDQUOT:
zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
return (-1);
+
case EAGAIN:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool I/O is currently suspended"));
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
+ case EROFS:
+ zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
+ break;
+
default:
zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
@@ -484,6 +507,29 @@ zfs_alloc(libzfs_handle_t *hdl, size_t size)
}
/*
+ * A safe form of asprintf() which will die if the allocation fails.
+ */
+/*PRINTFLIKE2*/
+char *
+zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
+{
+ va_list ap;
+ char *ret;
+ int err;
+
+ va_start(ap, fmt);
+
+ err = vasprintf(&ret, fmt, ap);
+
+ va_end(ap);
+
+ if (err < 0)
+ (void) no_memory(hdl);
+
+ return (ret);
+}
+
+/*
* A safe form of realloc(), which also zeroes newly allocated space.
*/
void *
@@ -573,7 +619,7 @@ libzfs_load(void)
/* Not present in kernel, try loading it. */
if (kldload("zfs") < 0 || modfind("zfs") < 0) {
if (errno != EEXIST)
- return (error);
+ return (-1);
}
}
return (0);
@@ -584,17 +630,18 @@ libzfs_init(void)
{
libzfs_handle_t *hdl;
- if ((hdl = calloc(sizeof (libzfs_handle_t), 1)) == NULL) {
+ if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
+ return (NULL);
+ }
+
+ if (libzfs_load() < 0) {
+ free(hdl);
return (NULL);
}
if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
- if (libzfs_load() == 0)
- hdl->libzfs_fd = open(ZFS_DEV, O_RDWR);
- if (hdl->libzfs_fd < 0) {
- free(hdl);
- return (NULL);
- }
+ free(hdl);
+ return (NULL);
}
if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) {
@@ -624,6 +671,9 @@ libzfs_fini(libzfs_handle_t *hdl)
if (hdl->libzfs_log_str)
(void) free(hdl->libzfs_log_str);
zpool_free_handles(hdl);
+#ifdef sun
+ libzfs_fru_clear(hdl, B_TRUE);
+#endif
namespace_clear(hdl);
libzfs_mnttab_fini(hdl);
free(hdl);
@@ -656,7 +706,9 @@ zfs_get_pool_handle(const zfs_handle_t *zhp)
zfs_handle_t *
zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
{
- struct statfs statbuf;
+ struct stat64 statbuf;
+ struct extmnttab entry;
+ int ret;
if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) {
/*
@@ -665,18 +717,42 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
return (zfs_open(hdl, path, argtype));
}
- if (statfs(path, &statbuf) != 0) {
+ if (stat64(path, &statbuf) != 0) {
(void) fprintf(stderr, "%s: %s\n", path, strerror(errno));
return (NULL);
}
- if (strcmp(statbuf.f_fstypename, MNTTYPE_ZFS) != 0) {
+#ifdef sun
+ rewind(hdl->libzfs_mnttab);
+ while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) {
+ if (makedevice(entry.mnt_major, entry.mnt_minor) ==
+ statbuf.st_dev) {
+ break;
+ }
+ }
+#else
+ {
+ struct statfs sfs;
+
+ if ((ret = statfs(path, &sfs)) != 0) {
+ (void) fprintf(stderr, "%s: %s\n", path,
+ strerror(errno));
+ ret = -1;
+ }
+ statfs2mnttab(&sfs, &entry);
+ }
+#endif /* sun */
+ if (ret != 0) {
+ return (NULL);
+ }
+
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
(void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
path);
return (NULL);
}
- return (zfs_open(hdl, statbuf.f_mntfromname, ZFS_TYPE_FILESYSTEM));
+ return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM));
}
/*
@@ -687,7 +763,7 @@ int
zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
{
if (len == 0)
- len = 2048;
+ len = 16 * 1024;
zc->zc_nvlist_dst_size = len;
if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0)
@@ -813,6 +889,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
"PROPERTY"));
cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
"VALUE"));
+ cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
+ "RECEIVED"));
cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
"SOURCE"));
@@ -826,7 +904,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
* inheriting from the longest name. This is acceptable because in the
* majority of cases 'SOURCE' is the last column displayed, and we don't
* use the width anyway. Note that the 'VALUE' column can be oversized,
- * if the name of the property is much longer the any values we find.
+ * if the name of the property is much longer than any values we find.
*/
for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
/*
@@ -857,6 +935,11 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;
+ /* 'RECEIVED' column. */
+ if (pl != cbp->cb_proplist &&
+ pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
+ cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;
+
/*
* 'NAME' and 'SOURCE' columns
*/
@@ -872,7 +955,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
/*
* Now go through and print the headers.
*/
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < ZFS_GET_NCOLS; i++) {
switch (cbp->cb_columns[i]) {
case GET_COL_NAME:
title = dgettext(TEXT_DOMAIN, "NAME");
@@ -883,6 +966,9 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
case GET_COL_VALUE:
title = dgettext(TEXT_DOMAIN, "VALUE");
break;
+ case GET_COL_RECVD:
+ title = dgettext(TEXT_DOMAIN, "RECEIVED");
+ break;
case GET_COL_SOURCE:
title = dgettext(TEXT_DOMAIN, "SOURCE");
break;
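
A hypothetical rendering once the new RECEIVED column is selected (exact invocation assumed; it corresponds to the received-property support added elsewhere in this commit):

```c
/*
 * NAME       PROPERTY     VALUE  RECEIVED  SOURCE
 * tank/home  compression  on     off       received
 */
```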
@@ -891,7 +977,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
}
if (title != NULL) {
- if (i == 3 || cbp->cb_columns[i + 1] == 0)
+ if (i == (ZFS_GET_NCOLS - 1) ||
+ cbp->cb_columns[i + 1] == GET_COL_NONE)
(void) printf("%s", title);
else
(void) printf("%-*s ",
@@ -909,7 +996,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
void
zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
const char *propname, const char *value, zprop_source_t sourcetype,
- const char *source)
+ const char *source, const char *recvd_value)
{
int i;
const char *str;
@@ -924,7 +1011,7 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
if (cbp->cb_first)
zprop_print_headers(cbp, cbp->cb_type);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < ZFS_GET_NCOLS; i++) {
switch (cbp->cb_columns[i]) {
case GET_COL_NAME:
str = name;
@@ -961,14 +1048,21 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
"inherited from %s", source);
str = buf;
break;
+ case ZPROP_SRC_RECEIVED:
+ str = "received";
+ break;
}
break;
+ case GET_COL_RECVD:
+ str = (recvd_value == NULL ? "-" : recvd_value);
+ break;
+
default:
continue;
}
- if (cbp->cb_columns[i + 1] == 0)
+ if (cbp->cb_columns[i + 1] == GET_COL_NONE)
(void) printf("%s", str);
else if (cbp->cb_scripted)
(void) printf("%s\t", str);
@@ -976,7 +1070,6 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
(void) printf("%-*s ",
cbp->cb_colwidths[cbp->cb_columns[i]],
str);
-
}
(void) printf("\n");
@@ -1038,7 +1131,7 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
return (-1);
}
- /* Rely on stroull() to process the numeric portion. */
+ /* Rely on strtoull() to process the numeric portion. */
errno = 0;
*num = strtoull(value, &end, 10);
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
index ca68ca1fc00e..2c0778777653 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -36,20 +35,24 @@
#include <sys/zfs_context.h>
#include <sys/zmod.h>
#include <sys/utsname.h>
+#include <sys/systeminfo.h>
/*
* Emulation of kernel services in userland.
*/
-int hz = 119; /* frequency when using gethrtime() >> 23 for lbolt */
+int aok;
uint64_t physmem;
vnode_t *rootdir = (vnode_t *)0xabcd1234;
-char hw_serial[11];
+char hw_serial[HW_HOSTID_LEN];
struct utsname utsname = {
"userland", "libzpool", "1", "1", "na"
};
+/* this only exists to have its address taken */
+struct proc p0;
+
/*
* =========================================================================
* threads
@@ -137,7 +140,7 @@ mutex_tryenter(kmutex_t *mp)
{
ASSERT(mp->initialized == B_TRUE);
ASSERT(mp->m_owner != (void *)-1UL);
- if (mutex_trylock(&mp->m_lock) == 0) {
+ if (0 == mutex_trylock(&mp->m_lock)) {
ASSERT(mp->m_owner == NULL);
mp->m_owner = curthread;
return (1);
@@ -150,7 +153,7 @@ void
mutex_exit(kmutex_t *mp)
{
ASSERT(mp->initialized == B_TRUE);
- ASSERT(mp->m_owner == curthread);
+ ASSERT(mutex_owner(mp) == curthread);
mp->m_owner = NULL;
VERIFY(mutex_unlock(&mp->m_lock) == 0);
}
@@ -308,9 +311,9 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
struct timeval tv;
clock_t delta;
- abstime += lbolt;
+ abstime += ddi_get_lbolt();
top:
- delta = abstime - lbolt;
+ delta = abstime - ddi_get_lbolt();
if (delta <= 0)
return (-1);
@@ -432,10 +435,7 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
vp->v_fd = fd;
- if (S_ISCHR(st.st_mode))
- ioctl(fd, DIOCGMEDIASIZE, &vp->v_size);
- else
- vp->v_size = st.st_size;
+ vp->v_size = st.st_size;
vp->v_path = spa_strdup(path);
return (0);
@@ -497,6 +497,24 @@ vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td)
umem_free(vp, sizeof (vnode_t));
}
+/*
+ * At a minimum we need to update the size since vdev_reopen()
+ * will no longer call vn_openat().
+ */
+int
+fop_getattr(vnode_t *vp, vattr_t *vap)
+{
+ struct stat64 st;
+
+ if (fstat64(vp->v_fd, &st) == -1) {
+ close(vp->v_fd);
+ return (errno);
+ }
+
+ vap->va_size = st.st_size;
+ return (0);
+}
+
#ifdef ZFS_DEBUG
/*
@@ -811,6 +829,17 @@ ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
return (0);
}
+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+ char *end;
+
+ *result = strtoull(str, &end, base);
+ if (*result == 0)
+ return (errno);
+ return (0);
+}
+
/*
* =========================================================================
* kernel emulation setup & teardown
@@ -836,8 +865,8 @@ kernel_init(int mode)
dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
(double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
- snprintf(hw_serial, sizeof (hw_serial), "%lu",
- (unsigned long)gethostid());
+ (void) snprintf(hw_serial, sizeof (hw_serial), "%lu",
+ (mode & FWRITE) ? (unsigned long)gethostid() : 0);
VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
@@ -852,6 +881,8 @@ kernel_fini(void)
{
spa_fini();
+ system_taskq_fini();
+
close(random_fd);
close(urandom_fd);
@@ -942,3 +973,72 @@ ksiddomain_rele(ksiddomain_t *ksid)
spa_strfree(ksid->kd_name);
umem_free(ksid, sizeof (ksiddomain_t));
}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ char *buf;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx) + 1;
+ va_end(adx);
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ va_start(adx, fmt);
+ size = vsnprintf(buf, size, fmt, adx);
+ va_end(adx);
+
+ return (buf);
+}
+
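
Typical usage of kmem_asprintf(); strfree() pairs with it because the allocation length is strlen(buf) + 1:

```c
char *name = kmem_asprintf("%s@%s", "tank/fs", "snap");
/* ... use name ... */
strfree(name);
```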
+/* ARGSUSED */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ *minorp = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+void
+zfs_onexit_fd_rele(int fd)
+{
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+ return (0);
+}
+
+#ifdef __FreeBSD__
+/* ARGSUSED */
+int
+zvol_create_minors(const char *name)
+{
+ return (0);
+}
+#endif
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
index 942c836ed319..472cf7be0af9 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_CONTEXT_H
@@ -59,6 +58,7 @@ extern "C" {
#include <time.h>
#include <math.h>
#include <umem.h>
+#include <inttypes.h>
#include <fsshare.h>
#include <sys/note.h>
#include <sys/types.h>
@@ -80,7 +80,9 @@ extern "C" {
#include <sys/u8_textprep.h>
#include <sys/kernel.h>
#include <sys/disk.h>
+#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
#include <machine/atomic.h>
#define ZFS_EXPORTS_PATH "/etc/zfs/exports"
@@ -119,20 +121,27 @@ extern void vpanic(const char *, __va_list);
#define fm_panic panic
+extern int aok;
+
/* This definition is copied from assert.h. */
#if defined(__STDC__)
#if __STDC_VERSION__ - 0 >= 199901L
-#define verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0))
+#define zverify(EX) (void)((EX) || (aok) || \
+ (__assert(#EX, __FILE__, __LINE__), 0))
#else
-#define verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0))
+#define zverify(EX) (void)((EX) || (aok) || \
+ (__assert(#EX, __FILE__, __LINE__), 0))
#endif /* __STDC_VERSION__ - 0 >= 199901L */
#else
-#define verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0))
+#define zverify(EX) (void)((EX) || (aok) || \
+ (_assert("EX", __FILE__, __LINE__), 0))
#endif /* __STDC__ */
-#define VERIFY verify
-#define ASSERT assert
+#define VERIFY zverify
+#define ASSERT zverify
+#undef assert
+#define assert zverify
extern void __assert(const char *, const char *, int);
@@ -143,7 +152,7 @@ extern void __assert(const char *, const char *, int);
#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
const TYPE __left = (TYPE)(LEFT); \
const TYPE __right = (TYPE)(RIGHT); \
- if (!(__left OP __right)) { \
+ if (!(__left OP __right) && (!aok)) { \
char *__buf = alloca(256); \
(void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \
#LEFT, #OP, #RIGHT, \
@@ -209,6 +218,18 @@ typedef struct kthread kthread_t;
#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
zk_thread_create(func, arg)
#define thread_exit() thr_exit(NULL)
+#define thread_join(t) panic("libzpool cannot join threads")
+
+#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
+struct proc {
+ uintptr_t this_is_never_used_dont_dereference_it;
+};
+
+extern struct proc p0;
+
+#define PS_NONE -1
extern kthread_t *zk_thread_create(void (*func)(), void *arg);
@@ -225,8 +246,11 @@ typedef struct kmutex {
} kmutex_t;
#define MUTEX_DEFAULT USYNC_THREAD
-#undef MUTEX_HELD
+#undef MUTEX_HELD
+#undef MUTEX_NOT_HELD
#define MUTEX_HELD(m) ((m)->m_owner == curthread)
+#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
+#define _mutex_held(m) pthread_mutex_isowned_np(m)
/*
* Argh -- we have to get cheesy here because the kernel and userland
@@ -234,6 +258,7 @@ typedef struct kmutex {
*/
//extern int _mutex_init(mutex_t *mp, int type, void *arg);
//extern int _mutex_destroy(mutex_t *mp);
+//extern int _mutex_owned(mutex_t *mp);
#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp))
#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp))
@@ -305,6 +330,7 @@ extern void cv_broadcast(kcondvar_t *cv);
#define KM_PUSHPAGE KM_SLEEP
#define KM_NOSLEEP UMEM_DEFAULT
#define KMC_NODEBUG UMC_NODEBUG
+#define KMC_NOTOUCH 0 /* not needed for userland caches */
#define kmem_alloc(_s, _f) umem_alloc(_s, _f)
#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f)
#define kmem_free(_b, _s) umem_free(_b, _s)
@@ -315,10 +341,21 @@ extern void cv_broadcast(kcondvar_t *cv);
#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b)
#define kmem_debugging() 0
-#define kmem_cache_reap_now(c)
+#define kmem_cache_reap_now(_c) /* nothing */
+#define kmem_cache_set_move(_c, _cb) /* nothing */
+#define POINTER_INVALIDATE(_pp) /* nothing */
+#define POINTER_IS_VALID(_p) 0
typedef umem_cache_t kmem_cache_t;
+typedef enum kmem_cbrc {
+ KMEM_CBRC_YES,
+ KMEM_CBRC_NO,
+ KMEM_CBRC_LATER,
+ KMEM_CBRC_DONT_NEED,
+ KMEM_CBRC_DONT_KNOW
+} kmem_cbrc_t;
+
/*
* Task queues
*/
@@ -329,23 +366,30 @@ typedef void (task_func_t)(void *);
#define TASKQ_PREPOPULATE 0x0001
#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
-#define TASKQ_THREADS_CPU_PCT 0x0008 /* Use dynamic thread scheduling */
+#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */
+#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */
#define TQ_SLEEP KM_SLEEP /* Can block for memory */
#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */
-#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
+#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
+#define TQ_FRONT 0x08 /* Queue in front */
extern taskq_t *system_taskq;
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define taskq_create_proc(a, b, c, d, e, p, f) \
+ (taskq_create(a, b, c, d, e, f))
+#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
+ (taskq_create(a, b, maxclsyspri, d, e, f))
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern int taskq_member(taskq_t *, void *);
extern void system_taskq_init(void);
+extern void system_taskq_fini(void);
-#define taskq_dispatch_safe(tq, func, arg, task) \
- taskq_dispatch((tq), (func), (arg), TQ_SLEEP)
+#define taskq_dispatch_safe(tq, func, arg, flags, task) \
+ taskq_dispatch((tq), (func), (arg), (flags))
#define XVA_MAPSIZE 3
#define XVA_MAGIC 0x78766174
@@ -359,6 +403,7 @@ typedef struct vnode {
char *v_path;
} vnode_t;
+#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */
typedef struct xoptattr {
timestruc_t xoa_createtime; /* Create time of file */
@@ -374,6 +419,10 @@ typedef struct xoptattr {
uint8_t xoa_opaque;
uint8_t xoa_av_quarantined;
uint8_t xoa_av_modified;
+ uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+ uint8_t xoa_reparse;
+ uint8_t xoa_offline;
+ uint8_t xoa_sparse;
} xoptattr_t;
typedef struct vattr {
@@ -420,13 +469,15 @@ typedef struct vsecattr {
#define CRCREAT 0
+extern int fop_getattr(vnode_t *vp, vattr_t *vap);
+
#define VOP_CLOSE(vp, f, c, o, cr, ct) 0
#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0
-#define VOP_GETATTR(vp, vap, cr) ((vap)->va_size = (vp)->v_size, 0)
+#define VOP_GETATTR(vp, vap, cr) fop_getattr((vp), (vap))
#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd)
-#define VN_RELE(vp) vn_close(vp, 0, NULL, NULL)
+#define VN_RELE(vp) vn_close(vp, 0, NULL, NULL)
#define VN_RELE_ASYNC(vp, taskq) vn_close(vp, 0, NULL, NULL)
#define vn_lock(vp, type)
@@ -460,13 +511,18 @@ extern vnode_t *rootdir;
/*
* Random stuff
*/
-#define lbolt (gethrtime() >> 23)
-#define lbolt64 (gethrtime() >> 23)
-//#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */
+#define ddi_get_lbolt() (gethrtime() >> 23)
+#define ddi_get_lbolt64() (gethrtime() >> 23)
+#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */
extern void delay(clock_t ticks);
#define gethrestime_sec() time(NULL)
+#define gethrestime(t) \
+ do {\
+ (t)->tv_sec = gethrestime_sec();\
+ (t)->tv_nsec = 0;\
+ } while (0)
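+
+/*
+ * Worked check of the lbolt emulation above (editorial note): one
+ * emulated tick is 1 << 23 ns = 8,388,608 ns (~8.39 ms), so there are
+ * 1,000,000,000 / 8,388,608 ~= 119.2 ticks per second -- hence hz 119.
+ */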
#define max_ncpus 64
@@ -475,6 +531,9 @@ extern void delay(clock_t ticks);
#define CPU_SEQID (thr_self() & (max_ncpus - 1))
+#define kcred NULL
+#define CRED() NULL
+
#ifndef ptob
#define ptob(x) ((x) * PAGESIZE)
#endif
@@ -516,14 +575,20 @@ typedef struct callb_cpr {
#define zone_dataset_visible(x, y) (1)
#define INGLOBALZONE(z) (1)
+extern char *kmem_asprintf(const char *fmt, ...);
+#define strfree(str) kmem_free((str), strlen(str)+1)
+
/*
* Hostname information
*/
extern struct utsname utsname;
-extern char hw_serial[];
+extern char hw_serial[]; /* for userland-emulated hostid access */
extern int ddi_strtoul(const char *str, char **nptr, int base,
unsigned long *result);
+extern int ddi_strtoull(const char *str, char **nptr, int base,
+ u_longlong_t *result);
+
/* ZFS Boot Related stuff. */
struct _buf {
@@ -563,7 +628,6 @@ extern zoneid_t getzoneid(void);
#define lbolt (gethrtime() >> 23)
#define lbolt64 (gethrtime() >> 23)
-extern int hz;
extern uint64_t physmem;
#define gethrestime_sec() time(NULL)
@@ -593,6 +657,9 @@ void ksiddomain_rele(ksiddomain_t *);
typedef uint32_t idmap_rid_t;
+#define DDI_SLEEP KM_SLEEP
+#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) (0)
+
#define SX_SYSINIT(name, lock, desc)
#define SYSCTL_DECL(...)
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
index 1a73fe83cc3e..8db5d11c1327 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -49,6 +49,8 @@ struct taskq {
int tq_nalloc;
int tq_minalloc;
int tq_maxalloc;
+ kcondvar_t tq_maxalloc_cv;
+ int tq_maxalloc_wait;
task_t *tq_freelist;
task_t tq_task;
};
@@ -57,26 +59,36 @@ static task_t *
task_alloc(taskq_t *tq, int tqflags)
{
task_t *t;
+ int rv;
- if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
tq->tq_freelist = t->task_next;
} else {
- mutex_exit(&tq->tq_lock);
if (tq->tq_nalloc >= tq->tq_maxalloc) {
- if (!(tqflags & KM_SLEEP)) {
- mutex_enter(&tq->tq_lock);
+ if (!(tqflags & KM_SLEEP))
return (NULL);
- }
+
/*
* We don't want to exceed tq_maxalloc, but we can't
* wait for other tasks to complete (and thus free up
* task structures) without risking deadlock with
* the caller. So, we just delay for one second
- * to throttle the allocation rate.
+ * to throttle the allocation rate. If a task
+ * completes before the one-second timeout expires,
+ * task_free() will signal us and we will
+ * immediately retry the allocation.
*/
- delay(hz);
+ tq->tq_maxalloc_wait++;
+ rv = cv_timedwait(&tq->tq_maxalloc_cv,
+ &tq->tq_lock, ddi_get_lbolt() + hz);
+ tq->tq_maxalloc_wait--;
+ if (rv > 0)
+ goto again; /* signaled */
}
+ mutex_exit(&tq->tq_lock);
+
t = kmem_alloc(sizeof (task_t), tqflags);
+
mutex_enter(&tq->tq_lock);
if (t != NULL)
tq->tq_nalloc++;
@@ -96,6 +108,9 @@ task_free(taskq_t *tq, task_t *t)
kmem_free(t, sizeof (task_t));
mutex_enter(&tq->tq_lock);
}
+
+ if (tq->tq_maxalloc_wait)
+ cv_signal(&tq->tq_maxalloc_cv);
}
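
/*
 * Editorial sketch: the throttle in task_alloc() above is an instance
 * of the classic bounded-allocation pattern (hypothetical names,
 * illustrative only):
 *
 *	mutex_enter(&lock);
 *	while (in_use >= limit) {
 *		waiters++;
 *		(void) cv_timedwait(&cv, &lock, ddi_get_lbolt() + hz);
 *		waiters--;
 *	}
 *	in_use++;
 *	mutex_exit(&lock);
 *
 * The release side signals with the lock held when waiters != 0,
 * exactly as task_free() does. Unlike the strict form above,
 * task_alloc() waits at most one timeout round and then allocates
 * anyway, since it cannot risk deadlocking the caller.
 */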
taskqid_t
@@ -114,8 +129,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
mutex_exit(&tq->tq_lock);
return (0);
}
- t->task_next = &tq->tq_task;
- t->task_prev = tq->tq_task.task_prev;
+ if (tqflags & TQ_FRONT) {
+ t->task_next = tq->tq_task.task_next;
+ t->task_prev = &tq->tq_task;
+ } else {
+ t->task_next = &tq->tq_task;
+ t->task_prev = tq->tq_task.task_prev;
+ }
t->task_next->task_prev = t;
t->task_prev->task_next = t;
t->task_func = func;
@@ -191,6 +211,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
tq->tq_flags = flags | TASKQ_ACTIVE;
tq->tq_active = nthreads;
tq->tq_nthreads = nthreads;
@@ -247,6 +268,7 @@ taskq_destroy(taskq_t *tq)
mutex_destroy(&tq->tq_lock);
cv_destroy(&tq->tq_dispatch_cv);
cv_destroy(&tq->tq_wait_cv);
+ cv_destroy(&tq->tq_maxalloc_cv);
kmem_free(tq, sizeof (taskq_t));
}
@@ -272,3 +294,10 @@ system_taskq_init(void)
system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512,
TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
}
+
+void
+system_taskq_fini(void)
+{
+ taskq_destroy(system_taskq);
+ system_taskq = NULL; /* defensive */
+}
diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/util.c b/cddl/contrib/opensolaris/lib/libzpool/common/util.c
index 781edb6e8abc..9b99531fd1c5 100644
--- a/cddl/contrib/opensolaris/lib/libzpool/common/util.c
+++ b/cddl/contrib/opensolaris/lib/libzpool/common/util.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -90,7 +89,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
if (is_log)
prefix = "log ";
- if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) != 0)
vs = &v0;
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/__init__.py b/cddl/contrib/opensolaris/lib/pyzfs/common/__init__.py
index f4b0f539542f..76b0998a3e7e 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/__init__.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/__init__.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
"""
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/allow.py b/cddl/contrib/opensolaris/lib/pyzfs/common/allow.py
index d3a03c731868..fa8209f697fd 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/allow.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/allow.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
"""This module implements the "zfs allow" and "zfs unallow" subcommands.
@@ -204,8 +203,8 @@ def args_to_perms(parser, options, who, perms):
perms_subcmd = dict(
create=_("Must also have the 'mount' ability"),
destroy=_("Must also have the 'mount' ability"),
- snapshot=_("Must also have the 'mount' ability"),
- rollback=_("Must also have the 'mount' ability"),
+ snapshot="",
+ rollback="",
clone=_("""Must also have the 'create' ability and 'mount'
\t\t\t\tability in the origin file system"""),
promote=_("""Must also have the 'mount'
@@ -217,6 +216,9 @@ perms_subcmd = dict(
mount=_("Allows mount/umount of ZFS datasets"),
share=_("Allows sharing file systems over NFS or SMB\n\t\t\t\tprotocols"),
send="",
+ hold=_("Allows adding a user hold to a snapshot"),
+ release=_("Allows releasing a user hold which\n\t\t\t\tmight destroy the snapshot"),
+ diff=_("Allows lookup of paths within a dataset,\n\t\t\t\tgiven an object number. Ordinary users need this\n\t\t\t\tin order to use zfs diff"),
)
perms_other = dict(
@@ -265,7 +267,7 @@ def print_perms():
print(fmt % (name, _("property"), ""))
def do_allow():
- """Implementes the "zfs allow" and "zfs unallow" subcommands."""
+ """Implements the "zfs allow" and "zfs unallow" subcommands."""
un = (sys.argv[1] == "unallow")
def usage(msg=None):
@@ -320,7 +322,7 @@ def do_allow():
if sys.argv[2] == "-h":
# hack to make "zfs allow -h" work
usage()
- ds = zfs.dataset.Dataset(sys.argv[2])
+ ds = zfs.dataset.Dataset(sys.argv[2], snaps=False)
p = dict()
for (fs, raw) in ds.get_fsacl().items():
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/dataset.py b/cddl/contrib/opensolaris/lib/pyzfs/common/dataset.py
index b45173e01f2e..26192e4075d2 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/dataset.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/dataset.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
"""Implements the Dataset class, providing methods for manipulating ZFS
@@ -109,7 +108,7 @@ class Dataset(object):
types is an iterable of strings specifying which types
of datasets are permitted. Accepted strings are
- "filesystem" and "volume". Defaults to acceptying all
+ "filesystem" and "volume". Defaults to accepting all
types.
snaps is a boolean specifying if snapshots are acceptable.
@@ -203,3 +202,33 @@ class Dataset(object):
Return a dict("whostr": { "perm" -> None })."""
return zfs.ioctl.get_fsacl(self.name)
+
+ def get_holds(self):
+ """Get the user holds on this Dataset.
+
+ Return a dict("tag": timestamp)."""
+
+ return zfs.ioctl.get_holds(self.name)
+
+def snapshots_fromcmdline(dsnames, recursive):
+ for dsname in dsnames:
+ if not "@" in dsname:
+ raise zfs.util.ZFSError(errno.EINVAL,
+ _("cannot open %s") % dsname,
+ _("operation only applies to snapshots"))
+ try:
+ ds = Dataset(dsname)
+ yield ds
+ except zfs.util.ZFSError, e:
+ if not recursive or e.errno != errno.ENOENT:
+ raise
+ if recursive:
+ (base, snapname) = dsname.split('@')
+ parent = Dataset(base)
+ for child in parent.descendents():
+ try:
+ yield Dataset(child.name + "@" +
+ snapname)
+ except zfs.util.ZFSError, e:
+ if e.errno != errno.ENOENT:
+ raise
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/groupspace.py b/cddl/contrib/opensolaris/lib/pyzfs/common/groupspace.py
index 7db4bf3e0c20..9f380fdb89f1 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/groupspace.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/groupspace.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
import zfs.userspace
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/holds.py b/cddl/contrib/opensolaris/lib/pyzfs/common/holds.py
new file mode 100644
index 000000000000..800e28f974dd
--- /dev/null
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/holds.py
@@ -0,0 +1,75 @@
+#! /usr/bin/python2.6
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+"""This module implements the "zfs holds" subcommand.
+The only public interface is the zfs.holds.do_holds() function."""
+
+import optparse
+import sys
+import errno
+import time
+import zfs.util
+import zfs.dataset
+import zfs.table
+
+_ = zfs.util._
+
+def do_holds():
+ """Implements the "zfs holds" subcommand."""
+ def usage(msg=None):
+ parser.print_help()
+ if msg:
+ print
+ parser.exit("zfs: error: " + msg)
+ else:
+ parser.exit()
+
+ u = _("""holds [-r] <snapshot> ...""")
+
+ parser = optparse.OptionParser(usage=u, prog="zfs")
+
+ parser.add_option("-r", action="store_true", dest="recursive",
+ help=_("list holds recursively"))
+
+ (options, args) = parser.parse_args(sys.argv[2:])
+
+ if len(args) < 1:
+ usage(_("missing snapshot argument"))
+
+ fields = ("name", "tag", "timestamp")
+ rjustfields = ()
+ printing = False
+ gotone = False
+ t = zfs.table.Table(fields, rjustfields)
+ for ds in zfs.dataset.snapshots_fromcmdline(args, options.recursive):
+ gotone = True
+ for tag, tm in ds.get_holds().iteritems():
+ val = {"name": ds.name, "tag": tag,
+ "timestamp": time.ctime(tm)}
+ t.addline(ds.name, val)
+ printing = True
+ if printing:
+ t.printme()
+ elif not gotone:
+ raise zfs.util.ZFSError(errno.ENOENT, _("no matching datasets"))
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/ioctl.c b/cddl/contrib/opensolaris/lib/pyzfs/common/ioctl.c
index 4571147a4876..d1f82a7dc888 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/ioctl.c
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/ioctl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,8 +29,6 @@
#include <strings.h>
#include <unistd.h>
#include <libnvpair.h>
-#include <idmap.h>
-#include <zone.h>
#include <libintl.h>
#include <libzfs.h>
#include <libzfs_impl.h>
@@ -45,10 +43,6 @@ static int zfsdevfd;
#define _(s) dgettext(TEXT_DOMAIN, s)
-#ifdef sun
-extern int sid_to_id(char *sid, boolean_t user, uid_t *id);
-#endif /* sun */
-
/*PRINTFLIKE1*/
static void
seterr(char *fmt, ...)
@@ -66,7 +60,7 @@ seterr(char *fmt, ...)
static char cmdstr[HIS_MAX_RECORD_LEN];
static int
-ioctl_with_cmdstr(unsigned long ioc, zfs_cmd_t *zc)
+ioctl_with_cmdstr(int ioc, zfs_cmd_t *zc)
{
int err;
@@ -138,8 +132,7 @@ dict2nvl(PyObject *d)
nvlist_t *nvl;
int err;
PyObject *key, *value;
-// int pos = 0;
- Py_ssize_t pos = 0;
+ int pos = 0;
if (!PyDict_Check(d)) {
PyErr_SetObject(PyExc_ValueError, d);
@@ -205,7 +198,7 @@ add_ds_props(zfs_cmd_t *zc, PyObject *nvl)
/* On error, returns NULL but does not set python exception. */
static PyObject *
-ioctl_with_dstnv(unsigned long ioc, zfs_cmd_t *zc)
+ioctl_with_dstnv(int ioc, zfs_cmd_t *zc)
{
int nvsz = 2048;
void *nvbuf;
@@ -236,7 +229,7 @@ again:
static PyObject *
py_next_dataset(PyObject *self, PyObject *args)
{
- unsigned long ioc;
+ int ioc;
uint64_t cookie;
zfs_cmd_t zc = { 0 };
int snaps;
@@ -353,6 +346,25 @@ py_set_fsacl(PyObject *self, PyObject *args)
}
static PyObject *
+py_get_holds(PyObject *self, PyObject *args)
+{
+ zfs_cmd_t zc = { 0 };
+ char *name;
+ PyObject *nvl;
+
+ if (!PyArg_ParseTuple(args, "s", &name))
+ return (NULL);
+
+ (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+ nvl = ioctl_with_dstnv(ZFS_IOC_GET_HOLDS, &zc);
+ if (nvl == NULL)
+ seterr(_("cannot get holds for %s"), name);
+
+ return (nvl);
+}
+
+static PyObject *
py_userspace_many(PyObject *self, PyObject *args)
{
zfs_cmd_t zc = { 0 };
@@ -440,80 +452,6 @@ py_userspace_upgrade(PyObject *self, PyObject *args)
}
static PyObject *
-py_sid_to_id(PyObject *self, PyObject *args)
-{
-#ifdef sun
- char *sid;
- int err, isuser;
- uid_t id;
-
- if (!PyArg_ParseTuple(args, "si", &sid, &isuser))
- return (NULL);
-
- err = sid_to_id(sid, isuser, &id);
- if (err) {
- PyErr_SetString(PyExc_KeyError, sid);
- return (NULL);
- }
-
- return (Py_BuildValue("I", id));
-#else /* sun */
- return (NULL);
-#endif /* sun */
-}
-
-/*
- * Translate the sid string ("S-1-...") to the user@domain name, if
- * possible. There should be a better way to do this, but for now we
- * just translate to the (possibly ephemeral) uid and then back again.
- */
-static PyObject *
-py_sid_to_name(PyObject *self, PyObject *args)
-{
-#ifdef sun
- char *sid;
- int err, isuser;
- uid_t id;
- char *name, *domain;
- char buf[256];
-
- if (!PyArg_ParseTuple(args, "si", &sid, &isuser))
- return (NULL);
-
- err = sid_to_id(sid, isuser, &id);
- if (err) {
- PyErr_SetString(PyExc_KeyError, sid);
- return (NULL);
- }
-
- if (isuser) {
- err = idmap_getwinnamebyuid(id,
- IDMAP_REQ_FLG_USE_CACHE, &name, &domain);
- } else {
- err = idmap_getwinnamebygid(id,
- IDMAP_REQ_FLG_USE_CACHE, &name, &domain);
- }
- if (err != IDMAP_SUCCESS) {
- PyErr_SetString(PyExc_KeyError, sid);
- return (NULL);
- }
- (void) snprintf(buf, sizeof (buf), "%s@%s", name, domain);
- free(name);
- free(domain);
-
- return (Py_BuildValue("s", buf));
-#else /* sun */
- return(NULL);
-#endif /* sun */
-}
-
-static PyObject *
-py_isglobalzone(PyObject *self, PyObject *args)
-{
- return (Py_BuildValue("i", getzoneid() == GLOBAL_ZONEID));
-}
-
-static PyObject *
py_set_cmdstr(PyObject *self, PyObject *args)
{
char *str;
@@ -584,12 +522,7 @@ static PyMethodDef zfsmethods[] = {
"Get dataset properties."},
{"get_proptable", py_get_proptable, METH_NOARGS,
"Get property table."},
- /* Below are not really zfs-specific: */
- {"sid_to_id", py_sid_to_id, METH_VARARGS, "Map SID to UID/GID."},
- {"sid_to_name", py_sid_to_name, METH_VARARGS,
- "Map SID to name@domain."},
- {"isglobalzone", py_isglobalzone, METH_NOARGS,
- "Determine if this is the global zone."},
+ {"get_holds", py_get_holds, METH_VARARGS, "Get user holds."},
{NULL, NULL, 0, NULL}
};
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/table.py b/cddl/contrib/opensolaris/lib/pyzfs/common/table.py
new file mode 100644
index 000000000000..d2a45a142c29
--- /dev/null
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/table.py
@@ -0,0 +1,70 @@
+#! /usr/bin/python2.6
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+import zfs.util
+
+class Table:
+ __slots__ = "fields", "rjustfields", "maxfieldlen", "lines"
+ __repr__ = zfs.util.default_repr
+
+ def __init__(self, fields, rjustfields=()):
+ # XXX maybe have defaults, too?
+ self.fields = fields
+ self.rjustfields = rjustfields
+ self.maxfieldlen = dict.fromkeys(fields, 0)
+ self.lines = list()
+
+ def __updatemax(self, k, v):
+ self.maxfieldlen[k] = max(self.maxfieldlen.get(k, None), v)
+
+ def addline(self, sortkey, values):
+ """values is a dict from field name to value"""
+
+ va = list()
+ for f in self.fields:
+ v = str(values[f])
+ va.append(v)
+ self.__updatemax(f, len(v))
+ self.lines.append((sortkey, va))
+
+ def printme(self, headers=True):
+ if headers:
+ d = dict([(f, f.upper()) for f in self.fields])
+ self.addline(None, d)
+
+ self.lines.sort()
+ for (k, va) in self.lines:
+ line = str()
+ for i in range(len(self.fields)):
+ if not headers:
+ line += va[i]
+ line += "\t"
+ else:
+ if self.fields[i] in self.rjustfields:
+ fmt = "%*s "
+ else:
+ fmt = "%-*s "
+ mfl = self.maxfieldlen[self.fields[i]]
+ line += fmt % (mfl, va[i])
+ print(line)
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/unallow.py b/cddl/contrib/opensolaris/lib/pyzfs/common/unallow.py
index 1458dc1328fd..cbdd4dd73f6f 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/unallow.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/unallow.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
import zfs.allow
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/userspace.py b/cddl/contrib/opensolaris/lib/pyzfs/common/userspace.py
index c269d51e1db7..33646bca5b7f 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/userspace.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/userspace.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,21 +19,22 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
"""This module implements the "zfs userspace" and "zfs groupspace" subcommands.
The only public interface is the zfs.userspace.do_userspace() function."""
-import zfs.util
-import zfs.ioctl
-import zfs.dataset
import optparse
import sys
import pwd
import grp
import errno
+import solaris.misc
+import zfs.util
+import zfs.ioctl
+import zfs.dataset
+import zfs.table
_ = zfs.util._
@@ -58,9 +59,6 @@ def skiptype(options, prop):
return True
return False
-def updatemax(d, k, v):
- d[k] = max(d.get(k, None), v)
-
def new_entry(options, isgroup, domain, rid):
"""Return a dict("field": value) for this domain (string) + rid (int)"""
@@ -70,9 +68,9 @@ def new_entry(options, isgroup, domain, rid):
idstr = "%u" % rid
(typename, mapfunc) = {
- (1, 1): ("SMB Group", lambda id: zfs.ioctl.sid_to_name(id, 0)),
+ (1, 1): ("SMB Group", lambda id: solaris.misc.sid_to_name(id, 0)),
(1, 0): ("POSIX Group", lambda id: grp.getgrgid(int(id)).gr_name),
- (0, 1): ("SMB User", lambda id: zfs.ioctl.sid_to_name(id, 1)),
+ (0, 1): ("SMB User", lambda id: solaris.misc.sid_to_name(id, 1)),
(0, 0): ("POSIX User", lambda id: pwd.getpwuid(int(id)).pw_name)
}[isgroup, bool(domain)]
@@ -102,8 +100,8 @@ def new_entry(options, isgroup, domain, rid):
v["quota.sort"] = 0
return v
-def process_one_raw(acct, maxfieldlen, options, prop, elem):
- """Update the acct and maxfieldlen dicts to incorporate the
+def process_one_raw(acct, options, prop, elem):
+ """Update the acct dict to incorporate the
information from this elem from Dataset.userspace(prop)."""
(domain, rid, value) = elem
@@ -111,7 +109,7 @@ def process_one_raw(acct, maxfieldlen, options, prop, elem):
if options.translate and domain:
try:
- rid = zfs.ioctl.sid_to_id("%s-%u" % (domain, rid),
+ rid = solaris.misc.sid_to_id("%s-%u" % (domain, rid),
not isgroup)
domain = None
except KeyError:
@@ -134,10 +132,6 @@ def process_one_raw(acct, maxfieldlen, options, prop, elem):
v[field] = str(value)
else:
v[field] = zfs.util.nicenum(value)
- for k in v.keys():
- # some of the .sort fields are integers, so have no len()
- if isinstance(v[k], str):
- updatemax(maxfieldlen, k, len(v[k]))
def do_userspace():
"""Implements the "zfs userspace" and "zfs groupspace" subcommands."""
@@ -156,7 +150,7 @@ def do_userspace():
defaulttypes = "posixgroup,smbgroup"
fields = ("type", "name", "used", "quota")
- ljustfields = ("type", "name")
+ rjustfields = ("used", "quota")
types = ("all", "posixuser", "smbuser", "posixgroup", "smbgroup")
u = _("%s [-niHp] [-o field[,...]] [-sS field] ... \n") % sys.argv[1]
@@ -209,38 +203,23 @@ def do_userspace():
ds = zfs.dataset.Dataset(dsname, types=("filesystem"))
- if ds.getprop("jailed") and zfs.ioctl.isglobalzone():
+ if ds.getprop("jailed") and solaris.misc.isglobalzone():
options.noname = True
if not ds.getprop("useraccounting"):
print(_("Initializing accounting information on old filesystem, please wait..."))
ds.userspace_upgrade()
- acct = dict()
- maxfieldlen = dict()
-
# gather and process accounting information
+ # Due to -i, we need to keep a dict, so we can potentially add
+ # together the posix ID and SID's usage. Grr.
+ acct = dict()
for prop in props.keys():
if skiptype(options, prop):
continue;
for elem in ds.userspace(prop):
- process_one_raw(acct, maxfieldlen, options, prop, elem)
-
- # print out headers
- if not options.noheaders:
- line = str()
- for field in options.fields:
- # make sure the field header will fit
- updatemax(maxfieldlen, field, len(field))
-
- if field in ljustfields:
- fmt = "%-*s "
- else:
- fmt = "%*s "
- line += fmt % (maxfieldlen[field], field.upper())
- print(line)
-
- # custom sorting func
+ process_one_raw(acct, options, prop, elem)
+
def cmpkey(val):
l = list()
for (opt, field) in options.sortfields:
@@ -261,17 +240,7 @@ def do_userspace():
l.append(n)
return l
- # print out data lines
- for val in sorted(acct.itervalues(), key=cmpkey):
- line = str()
- for field in options.fields:
- if options.noheaders:
- line += val[field]
- line += "\t"
- else:
- if field in ljustfields:
- fmt = "%-*s "
- else:
- fmt = "%*s "
- line += fmt % (maxfieldlen[field], val[field])
- print(line)
+ t = zfs.table.Table(options.fields, rjustfields)
+ for val in acct.itervalues():
+ t.addline(cmpkey(val), val)
+ t.printme(not options.noheaders)
diff --git a/cddl/contrib/opensolaris/lib/pyzfs/common/util.py b/cddl/contrib/opensolaris/lib/pyzfs/common/util.py
index 14d05a8bc12f..a33c6693ee00 100644
--- a/cddl/contrib/opensolaris/lib/pyzfs/common/util.py
+++ b/cddl/contrib/opensolaris/lib/pyzfs/common/util.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python2.4
+#! /usr/bin/python2.6
#
# CDDL HEADER START
#
@@ -19,8 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#
"""This module provides utility functions for ZFS.
@@ -29,6 +28,7 @@ zfs.util.dev -- a file object of /dev/zfs """
import gettext
import errno
import os
+import solaris.misc
# Note: this module (zfs.util) should not import zfs.ioctl, because that
# would introduce a circular dependency
@@ -37,8 +37,11 @@ errno.ENOTSUP = 48
dev = open("/dev/zfs", "w")
-_ = gettext.translation("SUNW_OST_OSLIB", "/usr/lib/locale",
- fallback=True).gettext
+try:
+ _ = gettext.translation("SUNW_OST_OSLIB", "/usr/lib/locale",
+ fallback=True).gettext
+except:
+ _ = solaris.misc.gettext
def default_repr(self):
"""A simple __repr__ function."""
diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile
index 3023a1de43b4..2235a2ee5741 100644
--- a/cddl/lib/libzfs/Makefile
+++ b/cddl/lib/libzfs/Makefile
@@ -6,8 +6,8 @@
.PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common
LIB= zfs
-DPADD= ${LIBUTIL}
-LDADD= -lutil
+DPADD= ${LIBMD} ${LIBPTHREAD} ${LIBUMEM} ${LIBUTIL}
+LDADD= -lmd -lpthread -lumem -lutil
SRCS= deviceid.c \
fsshare.c \
@@ -16,23 +16,28 @@ SRCS= deviceid.c \
zmount.c \
zone.c
-SRCS+= zfs_deleg.c \
- zfs_namecheck.c \
- zfs_prop.c \
- zpool_prop.c \
- zprop_common.c \
+SRCS+= libzfs_changelist.c \
+ libzfs_config.c \
libzfs_dataset.c \
- libzfs_util.c \
+ libzfs_diff.c \
libzfs_graph.c \
+ libzfs_import.c \
libzfs_mount.c \
libzfs_pool.c \
- libzfs_changelist.c \
- libzfs_config.c \
- libzfs_import.c \
libzfs_sendrecv.c \
- libzfs_status.c
+ libzfs_status.c \
+ libzfs_util.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_ioctl_compat.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zpool_prop.c \
+ zprop_common.c
WARNS?= 0
+CSTD= c99
CFLAGS+= -DZFS_NO_ACL
CFLAGS+= -I${.CURDIR}/../../../sbin/mount
CFLAGS+= -I${.CURDIR}/../../../cddl/lib/libumem
diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile
index 7e2841ab5203..0ff8c0d95e6d 100644
--- a/cddl/lib/libzpool/Makefile
+++ b/cddl/lib/libzpool/Makefile
@@ -11,7 +11,7 @@
# LIST_SRCS
.PATH: ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/os
# ATOMIC_SRCS
-.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "ia64" || ${MACHINE_ARCH} == "sparc64" || ${MACHINE_ARCH} == "powerpc64"
+.if exists(${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S)
.PATH: ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}
ATOMIC_SRCS= opensolaris_atomic.S
.if ${MACHINE_ARCH} != "ia64" && ${MACHINE_ARCH} != "sparc64"
@@ -38,16 +38,16 @@ SRCS= ${ZFS_COMMON_SRCS} ${ZFS_SHARED_SRCS} \
WARNS?= 0
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
-CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/include
-CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
+CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include
+CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/head
-CFLAGS+= -I${.CURDIR}/../../../cddl/lib/libumem
-CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/head
+CFLAGS+= -I${.CURDIR}/../../lib/libumem
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair
# XXX: pthread doesn't have mutex_owned() equivalent, so we need to look
# into libthr private structures. That's sooo evil, but it's only for
# the needs of ZFS debugging tools.
@@ -56,8 +56,8 @@ CFLAGS+= -I${.CURDIR}/../../../lib/libpthread/thread
CFLAGS+= -I${.CURDIR}/../../../lib/libpthread/sys
CFLAGS+= -I${.CURDIR}/../../../lib/libthr/arch/${MACHINE_CPUARCH}/include
-DPADD= ${LIBPTHREAD} ${LIBZ}
-LDADD= -lpthread -lz
+DPADD= ${LIBMD} ${LIBPTHREAD} ${LIBZ}
+LDADD= -lmd -lpthread -lz
# atomic.S doesn't like profiling.
NO_PROFILE=
diff --git a/cddl/sbin/zfs/Makefile b/cddl/sbin/zfs/Makefile
index 591ef06be5a8..11f6a0f13c58 100644
--- a/cddl/sbin/zfs/Makefile
+++ b/cddl/sbin/zfs/Makefile
@@ -19,10 +19,10 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
-DPADD= ${LIBZFS} ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} \
- ${LIBM} ${LIBNVPAIR} ${LIBUUTIL} ${LIBUTIL}
-LDADD= -lzfs -lgeom -lbsdxml -lsbuf \
- -lm -lnvpair -luutil -lutil
+DPADD= ${LIBBSDXML} ${LIBGEOM} ${LIBM} ${LIBNVPAIR} ${LIBSBUF} ${LIBUMEM} \
+ ${LIBUTIL} ${LIBUUTIL} ${LIBZFS}
+LDADD= -lbsdxml -lgeom -lm -lnvpair -lsbuf -lumem -lutil -luutil -lzfs
.include <bsd.prog.mk>
diff --git a/cddl/sbin/zpool/Makefile b/cddl/sbin/zpool/Makefile
index 06fd238fabfa..f810ee163e07 100644
--- a/cddl/sbin/zpool/Makefile
+++ b/cddl/sbin/zpool/Makefile
@@ -1,11 +1,13 @@
# $FreeBSD$
-.PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/cmd/zpool \
- ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
+.PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/cmd/zpool
+.PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/cmd/stat/common
+.PATH: ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
PROG= zpool
MAN= zpool.8
SRCS= zpool_main.c zpool_vdev.c zpool_iter.c zpool_util.c zfs_comutil.c
+SRCS+= timestamp.c
WARNS?= 0
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
@@ -21,10 +23,11 @@ CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
+CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
+CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/cmd/stat/common
-DPADD= ${LIBAVL} ${LIBZFS} ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} \
- ${LIBM} ${LIBNVPAIR} ${LIBUUTIL} ${LIBUTIL}
-LDADD= -lavl -lzfs -lgeom -lbsdxml -lsbuf \
- -lm -lnvpair -luutil -lutil
+DPADD= ${LIBAVL} ${LIBBSDXML} ${LIBGEOM} ${LIBM} ${LIBNVPAIR} ${LIBSBUF} \
+ ${LIBUMEM} ${LIBUTIL} ${LIBUUTIL} ${LIBZFS}
+LDADD= -lavl -lbsdxml -lgeom -lm -lnvpair -lsbuf -lumem -lutil -luutil -lzfs
.include <bsd.prog.mk>
diff --git a/cddl/usr.bin/Makefile b/cddl/usr.bin/Makefile
index c6b1341041d0..13d3a86232cb 100644
--- a/cddl/usr.bin/Makefile
+++ b/cddl/usr.bin/Makefile
@@ -8,12 +8,16 @@ SUBDIR= \
ctfmerge \
sgsmsg \
${_zinject} \
+ ${_zlook} \
+ ${_zstreamdump} \
${_ztest}
.if ${MK_ZFS} != "no"
_zinject= zinject
+#_zlook= zlook
.if ${MK_LIBTHR} != "no"
_ztest= ztest
+_zstreamdump = zstreamdump
.endif
.endif
diff --git a/cddl/usr.bin/zlook/Makefile b/cddl/usr.bin/zlook/Makefile
new file mode 100644
index 000000000000..0251f57bca63
--- /dev/null
+++ b/cddl/usr.bin/zlook/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../contrib/opensolaris/cmd/zlook
+
+PROG= zlook
+NO_MAN=
+
+WARNS?= 0
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
+#CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include
+#CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem
+#CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzfs/common
+#CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common
+#CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair
+#CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
+#CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
+#CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/head
+#CFLAGS+= -I${.CURDIR}/../../lib/libumem
+#
+#DPADD= ${LIBAVL} ${LIBGEOM} ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBUUTIL} \
+# ${LIBZFS} ${LIBZPOOL}
+#LDADD= -lavl -lgeom -lm -lnvpair -lumem -luutil -lzfs -lzpool
+
+.include <bsd.prog.mk>
diff --git a/cddl/usr.bin/zstreamdump/Makefile b/cddl/usr.bin/zstreamdump/Makefile
new file mode 100644
index 000000000000..304ff7ce71ea
--- /dev/null
+++ b/cddl/usr.bin/zstreamdump/Makefile
@@ -0,0 +1,27 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../contrib/opensolaris/cmd/zstreamdump
+
+PROG= zstreamdump
+MAN= zstreamdump.1
+
+WARNS?= 0
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
+CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include
+CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/head
+CFLAGS+= -I${.CURDIR}/../../lib/libumem
+
+DPADD= ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBZPOOL} \
+ ${LIBPTHREAD} ${LIBZ} ${LIBAVL}
+LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl
+
+CSTD= c99
+
+.include <bsd.prog.mk>
diff --git a/cddl/usr.bin/ztest/Makefile b/cddl/usr.bin/ztest/Makefile
index 8bb69b1658f3..979880cb01fb 100644
--- a/cddl/usr.bin/ztest/Makefile
+++ b/cddl/usr.bin/ztest/Makefile
@@ -10,6 +10,7 @@ CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include
CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem
CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common
+CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
diff --git a/cddl/usr.sbin/zdb/Makefile b/cddl/usr.sbin/zdb/Makefile
index b98038ed819f..446d1c49b90b 100644
--- a/cddl/usr.sbin/zdb/Makefile
+++ b/cddl/usr.sbin/zdb/Makefile
@@ -19,6 +19,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys
+CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/head
CFLAGS+= -I${.CURDIR}/../../lib/libumem
diff --git a/rescue/rescue/Makefile b/rescue/rescue/Makefile
index bc7ef13af1f3..81d230d7abd1 100644
--- a/rescue/rescue/Makefile
+++ b/rescue/rescue/Makefile
@@ -123,7 +123,7 @@ CRUNCH_LIBS+= -lalias -lcam -lcurses -ldevstat -lipsec
CRUNCH_LIBS+= -lipx
.endif
.if ${MK_ZFS} != "no"
-CRUNCH_LIBS+= -lzfs -lnvpair -luutil -lavl
+CRUNCH_LIBS+= -lavl -lnvpair -lpthread -lzfs -luutil -lumem
.endif
CRUNCH_LIBS+= -lgeom -lbsdxml -ljail -lkiconv -lmd -lreadline -lsbuf -lufs -lz
diff --git a/sys/boot/i386/gptzfsboot/Makefile b/sys/boot/i386/gptzfsboot/Makefile
index 0604332fa32a..2d1ace506698 100644
--- a/sys/boot/i386/gptzfsboot/Makefile
+++ b/sys/boot/i386/gptzfsboot/Makefile
@@ -33,6 +33,7 @@ CFLAGS= -DBOOTPROG=\"gptzfsboot\" \
-I${.CURDIR}/../../../cddl/boot/zfs \
-I${.CURDIR}/../btx/lib -I. \
-I${.CURDIR}/../boot2 \
+ -I${.CURDIR}/../../.. \
-Wall -Waggregate-return -Wbad-function-cast -Wcast-align \
-Wmissing-declarations -Wmissing-prototypes -Wnested-externs \
-Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \
diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile
index 876b0c062f9e..3aafb4350bb3 100644
--- a/sys/boot/i386/zfsboot/Makefile
+++ b/sys/boot/i386/zfsboot/Makefile
@@ -69,7 +69,7 @@ CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \
# We currently allow 65536 bytes for zfsboot - in practice it could be
# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr.
#
-BOOT2SIZE= 32768
+BOOT2SIZE= 65536
zfsboot2: zfsboot.ld
@set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
index a995f574f7ba..e313fdea380b 100644
--- a/sys/boot/zfs/zfs.c
+++ b/sys/boot/zfs/zfs.c
@@ -144,13 +144,16 @@ zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */)
{
spa_t *spa = (spa_t *) f->f_devdata;
struct file *fp = (struct file *)f->f_fsdata;
- const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus;
+ struct stat sb;
size_t n;
int rc;
+ rc = zfs_stat(f, &sb);
+ if (rc)
+ return (rc);
n = size;
- if (fp->f_seekp + n > zp->zp_size)
- n = zp->zp_size - fp->f_seekp;
+ if (fp->f_seekp + n > sb.st_size)
+ n = sb.st_size - fp->f_seekp;
rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
if (rc)
@@ -182,7 +185,6 @@ static off_t
zfs_seek(struct open_file *f, off_t offset, int where)
{
struct file *fp = (struct file *)f->f_fsdata;
- znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
switch (where) {
case SEEK_SET:
@@ -192,8 +194,18 @@ zfs_seek(struct open_file *f, off_t offset, int where)
fp->f_seekp += offset;
break;
case SEEK_END:
- fp->f_seekp = zp->zp_size - offset;
+ {
+ struct stat sb;
+ int error;
+
+ error = zfs_stat(f, &sb);
+ if (error != 0) {
+ errno = error;
+ return (-1);
+ }
+ fp->f_seekp = sb.st_size - offset;
break;
+ }
default:
errno = EINVAL;
return (-1);
@@ -204,16 +216,10 @@ zfs_seek(struct open_file *f, off_t offset, int where)
static int
zfs_stat(struct open_file *f, struct stat *sb)
{
+ spa_t *spa = (spa_t *) f->f_devdata;
struct file *fp = (struct file *)f->f_fsdata;
- znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
-
- /* only important stuff */
- sb->st_mode = zp->zp_mode;
- sb->st_uid = zp->zp_uid;
- sb->st_gid = zp->zp_gid;
- sb->st_size = zp->zp_size;
- return (0);
+ return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
}
static int
@@ -221,14 +227,16 @@ zfs_readdir(struct open_file *f, struct dirent *d)
{
spa_t *spa = (spa_t *) f->f_devdata;
struct file *fp = (struct file *)f->f_fsdata;
- znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
mzap_ent_phys_t mze;
+ struct stat sb;
size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
int rc;
- if ((zp->zp_mode >> 12) != 0x4) {
+ rc = zfs_stat(f, &sb);
+ if (rc)
+ return (rc);
+ if (!S_ISDIR(sb.st_mode))
return (ENOTDIR);
- }
/*
* If this is the first read, get the zap type.
diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c
index cb0912099c12..497667a8e0ef 100644
--- a/sys/boot/zfs/zfsimpl.c
+++ b/sys/boot/zfs/zfsimpl.c
@@ -31,6 +31,8 @@ __FBSDID("$FreeBSD$");
* Stand-alone ZFS file reader.
*/
+#include <sys/stat.h>
+
#include "zfsimpl.h"
#include "zfssubr.c"
@@ -70,26 +72,30 @@ zfs_init(void)
zfs_init_crc();
}
-static char *
-zfs_alloc_temp(size_t sz)
+static void *
+zfs_alloc(size_t size)
{
- char *p;
+ char *ptr;
- if (zfs_temp_ptr + sz > zfs_temp_end) {
+ if (zfs_temp_ptr + size > zfs_temp_end) {
printf("ZFS: out of temporary buffer space\n");
for (;;) ;
}
- p = zfs_temp_ptr;
- zfs_temp_ptr += sz;
+ ptr = zfs_temp_ptr;
+ zfs_temp_ptr += size;
- return (p);
+ return (ptr);
}
static void
-zfs_reset_temp(void)
+zfs_free(void *ptr, size_t size)
{
- zfs_temp_ptr = zfs_temp_buf;
+ zfs_temp_ptr -= size;
+ if (zfs_temp_ptr != ptr) {
+ printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
+ for (;;) ;
+ }
}
static int
@@ -341,7 +347,7 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
- if (bp && zio_checksum_error(bp, buf))
+ if (bp && zio_checksum_error(bp, buf, offset))
return (EIO);
return (0);
@@ -428,7 +434,8 @@ vdev_create(uint64_t guid, vdev_read_t *read)
}
static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
+ vdev_t **vdevp, int is_newer)
{
int rc;
uint64_t guid, id, ashift, nparity;
@@ -453,7 +460,7 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
&& strcmp(type, VDEV_TYPE_DISK)
&& strcmp(type, VDEV_TYPE_RAIDZ)
&& strcmp(type, VDEV_TYPE_REPLACING)) {
- printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
+ printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
return (EIO);
}
@@ -484,6 +491,7 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
vdev = vdev_create(guid, vdev_disk_read);
vdev->v_id = id;
+ vdev->v_top = pvdev != NULL ? pvdev : vdev;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
DATA_TYPE_UINT64, 0, &ashift) == 0)
vdev->v_ashift = ashift;
@@ -503,8 +511,14 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
if (!strcmp(type, "raidz")) {
if (vdev->v_nparity == 1)
vdev->v_name = "raidz1";
- else
+ else if (vdev->v_nparity == 2)
vdev->v_name = "raidz2";
+ else if (vdev->v_nparity == 3)
+ vdev->v_name = "raidz3";
+ else {
+ printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
+ return (EIO);
+ }
} else {
vdev->v_name = strdup(type);
}
@@ -541,7 +555,7 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
if (rc == 0) {
vdev->v_nchildren = nkids;
for (i = 0; i < nkids; i++) {
- rc = vdev_init_from_nvlist(kids, &kid, is_newer);
+ rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
if (rc)
return (rc);
if (is_new)
@@ -770,7 +784,7 @@ vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
const char *pool_name;
const unsigned char *vdevs;
int i, rc, is_newer;
- char upbuf[1024];
+ char *upbuf;
const struct uberblock *up;
/*
@@ -814,17 +828,10 @@ vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
return (EIO);
}
-#ifndef TEST
- if (val != POOL_STATE_ACTIVE) {
- /*
- * Don't print a message here. If we happen to reboot
- * while where is an exported pool around, we don't
- * need a cascade of confusing messages during boot.
- */
- /*printf("ZFS: pool is not active\n");*/
+ if (val == POOL_STATE_DESTROYED) {
+ /* The only pools we refuse to boot from are destroyed ones. */
return (EIO);
}
-#endif
if (nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_TXG,
@@ -884,7 +891,7 @@ vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
return (EIO);
}
- rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
+ rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
if (rc)
return (rc);
@@ -920,22 +927,23 @@ vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
* the best uberblock and then we can actually access
* the contents of the pool.
*/
+ upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
+ up = (const struct uberblock *)upbuf;
for (i = 0;
- i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
+ i < VDEV_UBERBLOCK_COUNT(vdev);
i++) {
- off = offsetof(vdev_label_t, vl_uberblock);
- off += i << UBERBLOCK_SHIFT;
+ off = VDEV_UBERBLOCK_OFFSET(vdev, i);
BP_ZERO(&bp);
DVA_SET_OFFSET(&bp.blk_dva[0], off);
- BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
- BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+ BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
+ BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
- if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
+
+ if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
continue;
- up = (const struct uberblock *) upbuf;
if (up->ub_magic != UBERBLOCK_MAGIC)
continue;
if (up->ub_txg < spa->spa_txg)
@@ -947,6 +955,7 @@ vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
spa->spa_uberblock = *up;
}
}
+ zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
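+
+/*
+ * Editorial note: VDEV_UBERBLOCK_SIZE()/VDEV_UBERBLOCK_COUNT() size the
+ * uberblock slots from the vdev's ashift instead of the old fixed
+ * 1 << UBERBLOCK_SHIFT ring entries, which is why upbuf must now be
+ * allocated dynamically rather than as a 1024-byte stack array.
+ */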
if (spap)
*spap = spa;
@@ -1000,16 +1009,11 @@ static int
zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
{
int cpfunc = BP_GET_COMPRESS(bp);
- size_t lsize = BP_GET_LSIZE(bp);
- size_t psize = BP_GET_PSIZE(bp);
+ uint64_t align, size;
void *pbuf;
- int i;
+ int i, error;
- zfs_reset_temp();
- if (cpfunc != ZIO_COMPRESS_OFF)
- pbuf = zfs_alloc_temp(psize);
- else
- pbuf = buf;
+ error = EIO;
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
const dva_t *dva = &bp->blk_dva[i];
@@ -1021,32 +1025,49 @@ zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
continue;
if (DVA_GET_GANG(dva)) {
- if (zio_read_gang(spa, bp, dva, buf))
+ error = zio_read_gang(spa, bp, dva, buf);
+ if (error != 0)
continue;
} else {
vdevid = DVA_GET_VDEV(dva);
offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
if (vdev->v_id == vdevid)
break;
- if (!vdev || !vdev->v_read) {
- continue;
}
- if (vdev->v_read(vdev, bp, pbuf, offset, psize))
+ if (!vdev || !vdev->v_read)
continue;
- if (cpfunc != ZIO_COMPRESS_OFF) {
- if (zio_decompress_data(cpfunc, pbuf, psize,
- buf, lsize))
- return (EIO);
+ size = BP_GET_PSIZE(bp);
+ align = 1ULL << vdev->v_top->v_ashift;
+ if (P2PHASE(size, align) != 0)
+ size = P2ROUNDUP(size, align);
+ if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+ pbuf = zfs_alloc(size);
+ else
+ pbuf = buf;
+
+ error = vdev->v_read(vdev, bp, pbuf, offset, size);
+ if (error == 0) {
+ if (cpfunc != ZIO_COMPRESS_OFF) {
+ error = zio_decompress_data(cpfunc,
+ pbuf, BP_GET_PSIZE(bp), buf,
+ BP_GET_LSIZE(bp));
+ } else if (size != BP_GET_PSIZE(bp)) {
+ bcopy(pbuf, buf, BP_GET_PSIZE(bp));
+ }
}
+ if (buf != pbuf)
+ zfs_free(pbuf, size);
+ if (error != 0)
+ continue;
}
-
- return (0);
+ error = 0;
+ break;
}
- printf("ZFS: i/o error - all block copies unavailable\n");
-
- return (EIO);
+ if (error != 0)
+ printf("ZFS: i/o error - all block copies unavailable\n");
+ return (error);
}
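
/*
 * Worked example of the alignment path above (illustrative numbers):
 * on a top-level vdev with ashift == 12 (4 KB sectors), a block with
 * BP_GET_PSIZE(bp) == 1536 is read as P2ROUNDUP(1536, 4096) == 4096
 * bytes into a bounce buffer, and only the leading 1536 bytes are
 * copied (or decompressed) into the caller's buffer.
 */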
static int
@@ -1276,7 +1297,7 @@ zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *va
{
int rc;
uint64_t zap_type;
- size_t size = dnode->dn_datablkszsec * 512;
+ size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
rc = dnode_read(spa, dnode, 0, zap_scratch, size);
if (rc)
@@ -1285,8 +1306,10 @@ zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *va
zap_type = *(uint64_t *) zap_scratch;
if (zap_type == ZBT_MICRO)
return mzap_lookup(spa, dnode, name, value);
- else
+ else if (zap_type == ZBT_HEADER)
return fzap_lookup(spa, dnode, name, value);
+ printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
+ return (EIO);
}
#ifdef BOOT2
@@ -1497,6 +1520,7 @@ zfs_mount_root(spa_t *spa, objset_phys_t *objset)
static int
zfs_mount_pool(spa_t *spa)
{
+
/*
* Find the MOS and work our way in from there.
*/
@@ -1516,6 +1540,58 @@ zfs_mount_pool(spa_t *spa)
return (0);
}
+static int
+zfs_dnode_stat(spa_t *spa, dnode_phys_t *dn, struct stat *sb)
+{
+
+ if (dn->dn_bonustype != DMU_OT_SA) {
+ znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
+
+ sb->st_mode = zp->zp_mode;
+ sb->st_uid = zp->zp_uid;
+ sb->st_gid = zp->zp_gid;
+ sb->st_size = zp->zp_size;
+ } else {
+ sa_hdr_phys_t *sahdrp;
+ int hdrsize;
+ size_t size = 0;
+ void *buf = NULL;
+
+ if (dn->dn_bonuslen != 0)
+ sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
+ else {
+ if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
+ blkptr_t *bp = &dn->dn_spill;
+ int error;
+
+ size = BP_GET_LSIZE(bp);
+ buf = zfs_alloc(size);
+ error = zio_read(spa, bp, buf);
+ if (error != 0) {
+ zfs_free(buf, size);
+ return (error);
+ }
+ sahdrp = buf;
+ } else {
+ return (EIO);
+ }
+ }
+ hdrsize = SA_HDR_SIZE(sahdrp);
+ sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
+ SA_MODE_OFFSET);
+ sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
+ SA_UID_OFFSET);
+ sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
+ SA_GID_OFFSET);
+ sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
+ SA_SIZE_OFFSET);
+ if (buf != NULL)
+ zfs_free(buf, size);
+ }
+
+ return (0);
+}
+
/*
* Lookup a file and return its dnode.
*/
@@ -1525,11 +1601,11 @@ zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
int rc;
uint64_t objnum, rootnum, parentnum;
dnode_phys_t dn;
- const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
const char *p, *q;
char element[256];
char path[1024];
int symlinks_followed = 0;
+ struct stat sb;
if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
printf("ZFS: unexpected object set type %llu\n",
@@ -1569,9 +1645,11 @@ zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
p = 0;
}
- if ((zp->zp_mode >> 12) != 0x4) {
+ rc = zfs_dnode_stat(spa, &dn, &sb);
+ if (rc)
+ return (rc);
+ if (!S_ISDIR(sb.st_mode))
return (ENOTDIR);
- }
parentnum = objnum;
rc = zap_lookup(spa, &dn, element, &objnum);
@@ -1586,7 +1664,10 @@ zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
/*
* Check for symlink.
*/
- if ((zp->zp_mode >> 12) == 0xa) {
+ rc = zfs_dnode_stat(spa, &dn, &sb);
+ if (rc)
+ return (rc);
+ if (S_ISLNK(sb.st_mode)) {
if (symlinks_followed > 10)
return (EMLINK);
symlinks_followed++;
@@ -1596,14 +1677,14 @@ zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
* current path onto the end.
*/
if (p)
- strcpy(&path[zp->zp_size], p);
+ strcpy(&path[sb.st_size], p);
else
- path[zp->zp_size] = 0;
- if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
+ path[sb.st_size] = 0;
+ if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
- zp->zp_size);
+ sb.st_size);
} else {
- rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
+ rc = dnode_read(spa, &dn, 0, path, sb.st_size);
if (rc)
return (rc);
}
diff --git a/sys/cddl/boot/zfs/fletcher.c b/sys/cddl/boot/zfs/fletcher.c
index 2b9728d70484..3c6003607039 100644
--- a/sys/cddl/boot/zfs/fletcher.c
+++ b/sys/cddl/boot/zfs/fletcher.c
@@ -43,6 +43,23 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
}
static void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+static void
fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
const uint32_t *ip = buf;
@@ -58,3 +75,20 @@ fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}
+
+static void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
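
The byteswap variants exist so a loader of one endianness can verify blocks written by a host of the other; zio_checksum_error() in zfssubr.c picks the variant from the block's byte order. A quick property check, assuming the routines above plus BSWAP_32 and zio_cksum_t from zfsimpl.h are in scope: swapping every word of the input and switching to the byteswap function must yield the identical checksum.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint32_t native[4] = { 1, 2, 3, 4 };
	uint32_t foreign[4];
	zio_cksum_t a, b;
	int i;

	for (i = 0; i < 4; i++)
		foreign[i] = BSWAP_32(native[i]);  /* other-endian writer */

	fletcher_4_native(native, sizeof (native), &a);
	fletcher_4_byteswap(foreign, sizeof (foreign), &b);

	/* same logical data, same checksum */
	printf("%s\n", memcmp(&a, &b, sizeof (a)) == 0 ? "equal" : "differ");
	return (0);
}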
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 34b9a63b72dd..9f24cc02e484 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -95,6 +95,14 @@
BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
/*
+ * Macros to reverse byte order
+ */
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
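These macros expand their argument several times and assume an unsigned operand at least as wide as the result, so BSWAP_64 in particular wants a uint64_t. A quick sanity check under that assumption:

#include <stdint.h>
#include <assert.h>

int
main(void)
{
	assert(BSWAP_16((uint16_t)0x1234) == 0x3412);
	assert(BSWAP_32(0x12345678U) == 0x78563412U);
	assert(BSWAP_64(0x0123456789abcdefULL) == 0xefcdab8967452301ULL);
	return (0);
}
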
+/*
* We currently support nine block sizes, from 512 bytes to 128K.
* We could go higher, but the benefits are near-zero and the cost
* of COWing a giant block to modify one byte would become excessive.
@@ -150,15 +158,15 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
+ * 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
+ * a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -182,25 +190,29 @@ typedef struct zio_cksum {
* cksum checksum function
* comp compression function
* G gang block indicator
- * E endianness
- * type DMU object type
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
* lvl level of indirection
- * birth txg transaction group in which the block was born
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
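The reshuffle is size-neutral: blk_pad shrank by exactly the word that blk_phys_birth now occupies, so blkptr_t stays at 1 << SPA_BLKPTRSHIFT = 128 bytes (three 16-byte DVAs, six 8-byte words, and a 32-byte checksum). A compile-time sketch of that invariant:

/* sketch: the on-disk block pointer must remain 128 bytes */
typedef char blkptr_size_check[(sizeof (blkptr_t) ==
    (1 << SPA_BLKPTRSHIFT)) ? 1 : -1];
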
/*
* Macros to get and set fields in a bp or DVA.
*/
@@ -246,9 +258,15 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
@@ -304,18 +322,41 @@ typedef struct blkptr {
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
+ (bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_eck_t zg_tail;
+} zio_gbh_phys_t;
+
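The arithmetic here tiles the gang block exactly: with a 512-byte SPA_GANGBLOCKSIZE, a 40-byte zio_eck_t, and the 128-byte blkptr_t above, SPA_GBH_NBLKPTRS = (512 - 40) / 128 = 3 and SPA_GBH_FILLER = (512 - 40 - 3*128) / 8 = 11, so 3*128 + 11*8 + 40 = 512. A compile-time sketch:

/* sketch: gang header blkptrs + filler + tail must fill the block exactly */
typedef char zio_gbh_size_check[(sizeof (zio_gbh_phys_t) ==
    SPA_GANGBLOCKSIZE) ? 1 : -1];
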
+#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
@@ -324,7 +365,7 @@ typedef struct zio_block_tail {
#define VDEV_UBERBLOCK_RING (128 << 10)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+ MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
@@ -332,8 +373,8 @@ typedef struct zio_block_tail {
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
@@ -363,24 +404,6 @@ typedef struct vdev_label {
#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
#define VDEV_LABELS 4
-/*
- * Gang block headers are self-checksumming and contain an array
- * of block pointers.
- */
-#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
-#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
-#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
- (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
- sizeof (uint64_t))
-
-typedef struct zio_gbh {
- blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
- uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
-} zio_gbh_phys_t;
-
enum zio_checksum {
ZIO_CHECKSUM_INHERIT = 0,
ZIO_CHECKSUM_ON,
@@ -391,10 +414,11 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_FUNCTIONS
};
-#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
enum zio_compress {
@@ -412,6 +436,7 @@ enum zio_compress {
ZIO_COMPRESS_GZIP_7,
ZIO_COMPRESS_GZIP_8,
ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
ZIO_COMPRESS_FUNCTIONS
};
@@ -470,13 +495,28 @@ typedef enum {
#define SPA_VERSION_13 13ULL
#define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+
/*
- * When bumping up SPA_VERSION, make sure GRUB ZFS understand the on-disk
- * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
- * and do the appropriate changes.
+ * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
+ * and do the appropriate changes. Also bump the version number in
+ * usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_15
-#define SPA_VERSION_STRING "15"
+#define SPA_VERSION SPA_VERSION_28
+#define SPA_VERSION_STRING "28"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -513,6 +553,20 @@ typedef enum {
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
/*
* The following are configuration names used in the nvlist describing a pool's
@@ -558,6 +612,8 @@ typedef enum {
#define ZPOOL_CONFIG_FAULTED "faulted"
#define ZPOOL_CONFIG_DEGRADED "degraded"
#define ZPOOL_CONFIG_REMOVED "removed"
+#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -566,7 +622,10 @@ typedef enum {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
+#define VDEV_TYPE_LOG "log"
+#define VDEV_TYPE_L2CACHE "l2cache"
/*
* This is needed in userland to report the minimum necessary device size.
@@ -577,11 +636,7 @@ typedef enum {
* The location of the pool configuration repository, shared between kernel and
* userland.
*/
-#define ZPOOL_CACHE_DIR "/boot/zfs"
-#define ZPOOL_CACHE_FILE "zpool.cache"
-#define ZPOOL_CACHE_TMP ".zpool.cache"
-
-#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE
+#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
/*
* vdev states are ordered from least to most healthy.
@@ -694,7 +749,11 @@ struct uberblock {
#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
-#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
@@ -716,7 +775,8 @@ typedef struct dnode_phys {
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
} dnode_phys_t;
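
The spill pointer is carved out of the tail of the old bonus area, so the dnode's on-disk size is unchanged at 512 bytes; dn_spill only holds a valid block pointer when DNODE_FLAG_SPILL_BLKPTR is set in dn_flags. A compile-time sketch of the size invariant:

/* sketch: splitting dn_bonus must not grow the 512-byte dnode */
typedef char dnode_size_check[(sizeof (dnode_phys_t) == 512) ? 1 : -1];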
typedef enum dmu_object_type {
@@ -744,7 +804,7 @@ typedef enum dmu_object_type {
DMU_OT_DSL_DATASET, /* UINT64 */
/* zpl: */
DMU_OT_ZNODE, /* ZNODE */
- DMU_OT_ACL, /* ACL */
+ DMU_OT_OLDACL, /* Old ACL */
DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
DMU_OT_MASTER_NODE, /* ZAP */
@@ -761,7 +821,24 @@ typedef enum dmu_object_type {
DMU_OT_SPA_HISTORY, /* UINT8 */
DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
DMU_OT_POOL_PROPS, /* ZAP */
-
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -776,6 +853,54 @@ typedef enum dmu_objset_type {
} dmu_objset_type_t;
/*
+ * Header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths", depending on the number of variable-sized
+ * attributes, which are determined by the "layout number".
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 are the layout number.
+ * Bits 10-15 are the size of the header.
+ * The header size is that number * 8.
+ *
+ * For example:
+ * hdrsz of 1 ==> 8 byte header
+ * 2 ==> 16 byte header
+ *
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
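These offsets are measured from the end of the variable-sized header and assume the default attribute ordering; zfs_dnode_stat() in the loader indexes the bonus (or spill) buffer with exactly this pattern. A minimal sketch, assuming sahdrp points at a valid header:

/* sketch: fetch one fixed-offset attribute out of an SA buffer */
static uint64_t
sa_get_attr(sa_hdr_phys_t *sahdrp, int attr_offset)
{
	int hdrsize = SA_HDR_SIZE(sahdrp);	/* decoded, already in bytes */

	return (*(uint64_t *)((char *)sahdrp + hdrsize + attr_offset));
}

For example, sa_get_attr(sahdrp, SA_UID_OFFSET) would yield the owner.
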
+/*
* Intent log header - this on disk structure holds fields to manage
* the log. All fields are 64 bit to easily handle cross architectures.
*/
@@ -787,12 +912,14 @@ typedef struct zil_header {
uint64_t zh_pad[5];
} zil_header_t;
+#define OBJSET_PHYS_SIZE 2048
+
typedef struct objset_phys {
dnode_phys_t os_meta_dnode;
zil_header_t os_zil_header;
uint64_t os_type;
uint64_t os_flags;
- char os_pad[2048 - sizeof (dnode_phys_t)*3 -
+ char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
sizeof (zil_header_t) - sizeof (uint64_t)*2];
dnode_phys_t os_userused_dnode;
dnode_phys_t os_groupused_dnode;
@@ -1174,11 +1301,12 @@ typedef struct vdev {
STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */
STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */
vdev_list_t v_children; /* children of this vdev */
- char *v_name; /* vdev name */
+ const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
int v_id; /* index in parent */
int v_ashift; /* offset to block shift */
int v_nparity; /* # parity for raidz */
+ struct vdev *v_top; /* parent vdev */
int v_nchildren; /* # children */
vdev_state_t v_state; /* current state */
vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c
index 25d349b1ce42..5022292b995d 100644
--- a/sys/cddl/boot/zfs/zfssubr.c
+++ b/sys/cddl/boot/zfs/zfssubr.c
@@ -28,6 +28,20 @@ __FBSDID("$FreeBSD$");
static uint64_t zfs_crc64_table[256];
+#define ECKSUM 666
+
+#define ASSERT(...) do { } while (0)
+#define ASSERT3U(...) do { } while (0)
+#define ASSERT3S(...) do { } while (0)
+
+#define panic(...) do { \
+ printf(__VA_ARGS__); \
+ for (;;) ; \
+} while (0)
+
+#define kmem_alloc(size, flag) zfs_alloc((size))
+#define kmem_free(ptr, size) zfs_free((ptr), (size))
+
static void
zfs_init_crc(void)
{
@@ -63,7 +77,8 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
typedef struct zio_checksum_info {
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
+ int ci_eck; /* uses zio embedded checksum? */
+ int ci_dedup; /* strong enough for dedup? */
const char *ci_name; /* descriptive name */
} zio_checksum_info_t;
@@ -71,17 +86,19 @@ typedef struct zio_checksum_info {
#include "sha256.c"
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, NULL}, 1, 1, "label"},
- {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"},
- {{fletcher_2_native, NULL}, 0, 1, "zilog"},
- {{fletcher_2_native, NULL}, 0, 0, "fletcher2"},
- {{fletcher_4_native, NULL}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"},
+ {{NULL, NULL}, 0, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};
+
/*
* Common signature for all zio compress/decompress functions.
*/
@@ -101,6 +118,7 @@ typedef struct zio_compress_info {
} zio_compress_info_t;
#include "lzjb.c"
+#include "zle.c"
/*
* Compression vectors.
@@ -120,33 +138,98 @@ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{NULL, NULL, 7, "gzip-7"},
{NULL, NULL, 8, "gzip-8"},
{NULL, NULL, 9, "gzip-9"},
+ {NULL, zle_decompress, 64, "zle"},
};
+static void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+/*
+ * Set the external verifier for a gang block based on <vdev, offset, txg>,
+ * a tuple which is guaranteed to be unique for the life of the pool.
+ */
+static void
+zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
+}
+
+/*
+ * Set the external verifier for a label block based on its offset.
+ * The vdev is implicit, and the txg is unknowable at pool open time --
+ * hence the logic in vdev_uberblock_load() to find the most recent copy.
+ */
+static void
+zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
+{
+ ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
+}
+
static int
-zio_checksum_error(const blkptr_t *bp, void *data)
+zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
{
- zio_cksum_t zc = bp->blk_cksum;
- unsigned int checksum = BP_GET_CHECKSUM(bp);
+ unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
uint64_t size = BP_GET_PSIZE(bp);
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t actual_cksum, expected_cksum;
+ zio_checksum_info_t *ci;
+ zio_cksum_t actual_cksum, expected_cksum, verifier;
+ int byteswap;
- if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
+ return (EINVAL);
+ ci = &zio_checksum_table[checksum];
+ if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
return (EINVAL);
- if (ci->ci_zbt) {
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = zc;
- ci->ci_func[0](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
- zc = expected_cksum;
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
+ checksum == ZIO_CHECKSUM_LABEL);
+
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_checksum_gang_verifier(&verifier, bp);
+ else if (checksum == ZIO_CHECKSUM_LABEL)
+ zio_checksum_label_verifier(&verifier, offset);
+ else
+ verifier = bp->blk_cksum;
+
+ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
+
+ if (byteswap)
+ byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+ expected_cksum = eck->zec_cksum;
+ eck->zec_cksum = verifier;
+ ci->ci_func[byteswap](data, size, &actual_cksum);
+ eck->zec_cksum = expected_cksum;
+
+ if (byteswap)
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
} else {
- /* ASSERT(!BP_IS_GANG(bp)); */
+ ASSERT(!BP_IS_GANG(bp));
+ expected_cksum = bp->blk_cksum;
ci->ci_func[0](data, size, &actual_cksum);
}
- if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
/*printf("ZFS: read checksum failed\n");*/
return (EIO);
}
@@ -158,14 +241,20 @@ static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
void *dest, uint64_t destsize)
{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ zio_compress_info_t *ci;
- /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
- if (!ci->ci_decompress) {
+ if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
return (EIO);
}
+ ci = &zio_compress_table[cpfunc];
+ if (!ci->ci_decompress) {
+ printf("ZFS: unsupported compression algorithm %s\n",
+ ci->ci_name);
+ return (EIO);
+ }
+
return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
@@ -176,8 +265,8 @@ zap_hash(uint64_t salt, const char *name)
uint8_t c;
uint64_t crc = salt;
- /*ASSERT(crc != 0);*/
- /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
+ ASSERT(crc != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
@@ -192,7 +281,8 @@ zap_hash(uint64_t salt, const char *name)
return (crc);
}
-static char *zfs_alloc_temp(size_t sz);
+static void *zfs_alloc(size_t size);
+static void zfs_free(void *ptr, size_t size);
typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
@@ -204,39 +294,47 @@ typedef struct raidz_col {
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
-static void
-vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
-{
- uint64_t *dst, *src, xcount, ccount, count, i;
- int c;
-
- xcount = cols[x].rc_size / sizeof (src[0]);
- //ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- //ASSERT(xcount > 0);
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
- src = cols[VDEV_RAIDZ_P].rc_data;
- dst = cols[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
-
- for (c = nparity; c < acols; c++) {
- src = cols[c].rc_data;
- dst = cols[x].rc_data;
-
- if (c == x)
- continue;
-
- ccount = cols[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
- }
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
}
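
The equivalence of the 64-bit form to eight independent byte multiplications by 2 in GF(2^8) is easy to spot-check; a throwaway sketch, assuming the byte-wise VDEV_RAIDZ_MUL_2 and the 64-bit VDEV_RAIDZ_64MUL_2 above:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t x = 0x0123456789abcdefULL, mask, expect = 0;
	int i;

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;

		/* multiply each byte by 2 in GF(2^8), one at a time */
		expect |= (uint64_t)(uint8_t)VDEV_RAIDZ_MUL_2(b) << (i * 8);
	}
	VDEV_RAIDZ_64MUL_2(x, mask);	/* all eight bytes at once */
	printf("%s\n", x == expect ? "match" : "MISMATCH");
	return (0);
}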
/*
@@ -321,8 +419,8 @@ vdev_raidz_exp2(uint8_t a, int exp)
if (a == 0)
return (0);
- //ASSERT(exp >= 0);
- //ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
+ ASSERT(exp >= 0);
+ ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
exp += vdev_raidz_log2[a];
if (exp > 255)
@@ -332,327 +430,1059 @@ vdev_raidz_exp2(uint8_t a, int exp)
}
static void
-vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
+vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ uint64_t *p, *src, pcount, ccount, i;
int c;
- pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
- //ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
- for (c = nparity; c < acols; c++) {
- src = cols[c].rc_data;
- p = cols[VDEV_RAIDZ_P].rc_data;
- q = cols[VDEV_RAIDZ_Q].rc_data;
- ccount = cols[c].rc_size / sizeof (src[0]);
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- if (c == nparity) {
- //ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount);
+ for (i = 0; i < ccount; i++, src++, p++) {
*p = *src;
}
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
+ } else {
+ ASSERT(ccount <= pcount);
+ for (i = 0; i < ccount; i++, src++, p++) {
+ *p ^= *src;
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+{
+ uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
+ int c;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
+ *p = *src;
+ *q = *src;
+ }
+ for (; i < pcnt; i++, src++, p++, q++) {
*p = 0;
+ *q = 0;
}
} else {
- //ASSERT(ccount <= pcount);
+ ASSERT(ccnt <= pcnt);
/*
- * Rather than multiplying each byte
- * individually (as described above), we are
- * able to handle 8 at once by generating a
- * mask based on the high bit in each byte and
- * using that to conditionally XOR in 0x1d.
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
*/
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *q ^= *src;
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
*p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ *q ^= *src;
}
/*
* Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
*/
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (; i < pcnt; i++, q++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
}
}
}
}
static void
-vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
- int c, j, exp;
+ uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ int c;
- xcount = cols[x].rc_size / sizeof (src[0]);
- //ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_R].rc_size);
- for (c = nparity; c < acols; c++) {
- src = cols[c].rc_data;
- dst = cols[x].rc_data;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
- if (c == x)
- ccount = 0;
- else
- ccount = cols[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- if (c == nparity) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+ *p = *src;
+ *q = *src;
+ *r = *src;
}
- for (; i < xcount; i++, dst++) {
- *dst = 0;
+ for (; i < pcnt; i++, src++, p++, q++, r++) {
+ *p = 0;
+ *q = 0;
+ *r = 0;
}
-
} else {
+ ASSERT(ccnt <= pcnt);
+
/*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
*/
- for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *dst ^= *src;
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+ *p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ *q ^= *src;
+
+ VDEV_RAIDZ_64MUL_4(*r, mask);
+ *r ^= *src;
}
- for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (; i < pcnt; i++, q++, r++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ VDEV_RAIDZ_64MUL_4(*r, mask);
}
}
}
+}
- src = cols[VDEV_RAIDZ_Q].rc_data;
- dst = cols[x].rc_data;
- exp = 255 - (acols - 1 - x);
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
+static void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ switch (rm->rm_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rm);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rm);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rm);
+ break;
+ default:
+ panic("invalid RAID-Z configuration");
+ }
+}
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen both for simplicity and speed
+ * of computation, as well as for linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
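All of this arithmetic is over GF(2^8) with the RAID-6-style reduction polynomial x^8 + x^4 + x^3 + x^2 + 1, which is where the recurring 0x1d comes from; the vdev_raidz_log2/vdev_raidz_pow2 tables used by vdev_raidz_exp2() are just a faster route to the product computed directly below. A standalone sketch:

#include <stdint.h>

/* sketch: shift-and-add multiply in GF(2^8) mod x^8+x^4+x^3+x^2+1 */
static uint8_t
gf256_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* add (XOR) the current multiple */
		/* multiply a by x, reducing if the high bit falls off */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
		b >>= 1;
	}
	return (p);
}
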
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
}
}
}
-
static void
-vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
- int x, int y, void *temp_p, void *temp_q)
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ }
- //ASSERT(x < y);
- //ASSERT(x >= nparity);
- //ASSERT(y < acols);
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
- //ASSERT(cols[x].rc_size >= cols[y].rc_size);
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+ jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
/*
- * Move the parity data aside -- we're going to compute parity as
- * though columns x and y were full of zeros -- Pxy and Qxy. We want to
- * reuse the parity generation mechanism without trashing the actual
- * parity so we make those columns appear to be full of zeros by
- * setting their lengths to zero.
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
*/
- pdata = cols[VDEV_RAIDZ_P].rc_data;
- qdata = cols[VDEV_RAIDZ_Q].rc_data;
- xsize = cols[x].rc_size;
- ysize = cols[y].rc_size;
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
- cols[VDEV_RAIDZ_P].rc_data = temp_p;
- cols[VDEV_RAIDZ_Q].rc_data = temp_q;
- cols[x].rc_size = 0;
- cols[y].rc_size = 0;
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
- vdev_raidz_generate_parity_pq(cols, nparity, acols);
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
- cols[x].rc_size = xsize;
- cols[y].rc_size = ysize;
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
- p = pdata;
- q = qdata;
- pxy = cols[VDEV_RAIDZ_P].rc_data;
- qxy = cols[VDEV_RAIDZ_Q].rc_data;
- xd = cols[x].rc_data;
- yd = cols[y].rc_data;
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
/*
- * We now have:
- * Pxy = P + D_x + D_y
- * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
- *
- * We can then solve for D_x:
- * D_x = A * (P + Pxy) + B * (Q + Qxy)
- * where
- * A = 2^(x - y) * (2^(x - y) + 1)^-1
- * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
- *
- * With D_x in hand, we can easily solve for D_y:
- * D_y = P + Pxy + D_x
+ * Verify that the data that is left in the rows are properly part of
+ * an identity matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t log, val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ log = 0; /* gcc */
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = zfs_alloc(psize);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rm->rm_cols);
+
+ src = rm->rm_col[c].rc_data;
+ ccount = rm->rm_col[c].rc_size;
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rm->rm_firstdatacol;
+ ASSERT3U(cc, >=, rm->rm_firstdatacol);
+ ASSERT3U(cc, <, rm->rm_cols);
+ ASSERT3U(cc, !=, c);
+
+ dst[j] = rm->rm_col[cc].rc_data;
+ dcount[j] = rm->rm_col[cc].rc_size;
+ }
+
+ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ zfs_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+ uint8_t *p, *pp;
+ size_t psize;
+
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ int code = 0;
+
+
+ n = rm->rm_cols - rm->rm_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
*/
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rm->rm_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rm->rm_firstdatacol;
+ }
+ }
- a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (acols - 1 - x)];
- tmp = 255 - vdev_raidz_log2[a ^ 1];
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rm->rm_firstdatacol);
- aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
- bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
+ code |= 1 << c;
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rm->rm_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
}
/*
- * Restore the saved parity data.
+ * Initialize the interesting rows of the matrix.
*/
- cols[VDEV_RAIDZ_P].rc_data = pdata;
- cols[VDEV_RAIDZ_Q].rc_data = qdata;
+ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ return (code);
}
static int
-vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
- off_t offset, size_t bytes)
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
- size_t psize = BP_GET_PSIZE(bp);
- vdev_t *kid;
- int unit_shift = vdev->v_ashift;
- int dcols = vdev->v_nchildren;
- int nparity = vdev->v_nparity;
- int missingdata, missingparity;
- int parity_errors, data_errors, unexpected_errors, total_errors;
- int parity_untried;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int ntgts;
+ int i, c;
+ int code;
+ int nbadparity, nbaddata;
+
+ /*
+ * The tgts list must already be sorted.
+ */
+ for (i = 1; i < nt; i++) {
+ ASSERT(t[i] > t[i - 1]);
+ }
+
+ nbadparity = rm->rm_firstdatacol;
+ nbaddata = rm->rm_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rm->rm_cols; c++) {
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rm->rm_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rm->rm_firstdatacol) {
+ nbaddata--;
+ } else {
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
+
+static raidz_map_t *
+vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
+ uint64_t dcols, uint64_t nparity)
+{
+ raidz_map_t *rm;
uint64_t b = offset >> unit_shift;
- uint64_t s = psize >> unit_shift;
+ uint64_t s = size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, coff;
- int c, c1, bc, col, acols, devidx, asize, n, max_rc_size;
- static raidz_col_t cols[16];
- raidz_col_t *rc, *rc1;
- void *orig, *orig1, *temp_p, *temp_q;
-
- orig = orig1 = temp_p = temp_q = NULL;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
q = s / (dcols - nparity);
r = s - q * (dcols - nparity);
bc = (r == 0 ? 0 : r + nparity);
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ if (q == 0) {
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
+
+ ASSERT3U(acols, <=, scols);
+
+ rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
+
+ rm->rm_cols = acols;
+ rm->rm_scols = scols;
+ rm->rm_bigcols = bc;
+ rm->rm_skipstart = bc;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
- acols = (q == 0 ? bc : dcols);
asize = 0;
- max_rc_size = 0;
-
- for (c = 0; c < acols; c++) {
+
+ for (c = 0; c < scols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << unit_shift;
}
- cols[c].rc_devidx = col;
- cols[c].rc_offset = coff;
- cols[c].rc_size = (q + (c < bc)) << unit_shift;
- cols[c].rc_data = NULL;
- cols[c].rc_error = 0;
- cols[c].rc_tried = 0;
- cols[c].rc_skipped = 0;
- asize += cols[c].rc_size;
- if (cols[c].rc_size > max_rc_size)
- max_rc_size = cols[c].rc_size;
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
}
- asize = roundup(asize, (nparity + 1) << unit_shift);
+ ASSERT3U(asize, ==, tot << unit_shift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+ ASSERT3U(rm->rm_nskip, <=, nparity);
- for (c = 0; c < nparity; c++) {
- cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
- }
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
- cols[c].rc_data = buf;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++)
- cols[c].rc_data = (char *)cols[c - 1].rc_data +
- cols[c - 1].rc_size;
+ rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
/*
- * If all data stored spans all columns, there's a danger that
- * parity will always be on the same device and, since parity
- * isn't read during normal operation, that that device's I/O
- * bandwidth won't be used effectively. We therefore switch
- * the parity every 1MB.
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
*
- * ... at least that was, ostensibly, the theory. As a
- * practical matter unless we juggle the parity between all
- * devices evenly, we won't see any benefit. Further,
- * occasional writes that aren't a multiple of the LCM of the
- * number of children and the minimum stripe width are
- * sufficient to avoid pessimal behavior. Unfortunately, this
- * decision created an implicit on-disk format requirement
- * that we need to support for all eternity, but only for
- * single-parity RAID-Z.
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ ASSERT(rm->rm_cols >= 2);
+ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ return (rm);
+}
+
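To make the geometry concrete, consider a sketch with dcols = 5 children, nparity = 1, and a 4 KiB write on 512-byte sectors (unit_shift = 9): s = 8, q = 8 / 4 = 2, r = 0, bc = 0, and tot = 8 + 1*2 = 10. Since q != 0, acols = scols = 5 and every column carries q = 2 sectors, so asize comes to 10 sectors, matching tot, and rm_nskip = roundup(10, 2) - 10 = 0, meaning no padding sectors are skipped.
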
+static void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ int c;
+ size_t size;
+
+ for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
+ zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+}
+
+static vdev_t *
+vdev_child(vdev_t *pvd, uint64_t devidx)
+{
+ vdev_t *cvd;
+
+ STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
+ if (cvd->v_id == devidx)
+ break;
+ }
+
+ return (cvd);
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(const blkptr_t *bp, void *data)
+{
+
+ return (zio_checksum_error(bp, data, 0));
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number such failures.
+ */
+static int
+raidz_parity_verify(raidz_map_t *rm)
+{
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_col_t *rc;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ orig[c] = zfs_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig[c], rc->rc_size);
+ }
+
+ vdev_raidz_generate_parity(rm);
+
+ for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ rc->rc_error = ECKSUM;
+ ret++;
+ }
+ zfs_free(orig[c], rc->rc_size);
+ }
+
+ return (ret);
+}
+
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
+ off_t offset, int total_errors, int data_errors)
+{
+ raidz_col_t *rc;
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *tgts = &tstore[1];
+ int current, next, i, c, n;
+ int code, ret = 0;
+
+ ASSERT(total_errors < rm->rm_firstdatacol);
+
+ /*
+ * This simplifies one edge condition.
*/
- //ASSERT(acols >= 2);
- //ASSERT(cols[0].rc_size == cols[1].rc_size);
+ tgts[-1] = -1;
+
+ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ /*
+ * Initialize the targets array by finding the first n columns
+ * that contain no error.
+ *
+ * If there were no data errors, we need to ensure that we're
+ * always explicitly attempting to reconstruct at least one
+ * data column. To do this, we simply push the highest target
+ * up into the data columns.
+ */
+ for (c = 0, i = 0; i < n; i++) {
+ if (i == n - 1 && data_errors == 0 &&
+ c < rm->rm_firstdatacol) {
+ c = rm->rm_firstdatacol;
+ }
+
+ while (rm->rm_col[c].rc_error != 0) {
+ c++;
+ ASSERT3S(c, <, rm->rm_cols);
+ }
+
+ tgts[i] = c++;
+ }
+
+ /*
+ * Setting tgts[n] simplifies the other edge condition.
+ */
+ tgts[n] = rm->rm_cols;
+
+ /*
+ * These buffers were allocated in previous iterations.
+ */
+ for (i = 0; i < n - 1; i++) {
+ ASSERT(orig[i] != NULL);
+ }
+
+ orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
+
+ current = 0;
+ next = tgts[current];
+
+ while (current != n) {
+ tgts[current] = next;
+ current = 0;
- if (nparity == 1 && (offset & (1ULL << 20))) {
- devidx = cols[0].rc_devidx;
- o = cols[0].rc_offset;
- cols[0].rc_devidx = cols[1].rc_devidx;
- cols[0].rc_offset = cols[1].rc_offset;
- cols[1].rc_devidx = devidx;
- cols[1].rc_offset = o;
+ /*
+ * Save off the original data that we're going to
+ * attempt to reconstruct.
+ */
+ for (i = 0; i < n; i++) {
+ ASSERT(orig[i] != NULL);
+ c = tgts[i];
+ ASSERT3S(c, >=, 0);
+ ASSERT3S(c, <, rm->rm_cols);
+ rc = &rm->rm_col[c];
+ bcopy(rc->rc_data, orig[i], rc->rc_size);
+ }
+
+ /*
+ * Attempt a reconstruction and exit the outer loop on
+ * success.
+ */
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+ if (raidz_checksum_verify(bp, data) == 0) {
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ ASSERT(rc->rc_error == 0);
+ rc->rc_error = ECKSUM;
+ }
+
+ ret = code;
+ goto done;
+ }
+
+ /*
+ * Restore the original data.
+ */
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ bcopy(orig[i], rc->rc_data, rc->rc_size);
+ }
+
+ do {
+ /*
+ * Find the next valid column after the current
+			 * position.
+ */
+ for (next = tgts[current] + 1;
+ next < rm->rm_cols &&
+ rm->rm_col[next].rc_error != 0; next++)
+ continue;
+
+ ASSERT(next <= tgts[current + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ */
+ if (next != tgts[current + 1])
+ break;
+
+ /*
+ * Otherwise, find the next valid column after
+ * the previous position.
+ */
+ for (c = tgts[current - 1] + 1;
+ rm->rm_col[c].rc_error != 0; c++)
+ continue;
+
+ tgts[current] = c;
+ current++;
+
+ } while (current != n);
+ }
+ }
+ n--;
+done:
+ for (i = n - 1; i >= 0; i--) {
+ zfs_free(orig[i], rm->rm_col[0].rc_size);
}
+ return (ret);
+}
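+
The target-selection loop above walks all n-column subsets with an odometer-style carry, using the tgts[-1] = -1 and tgts[n] = rm_cols sentinels to avoid special cases at either end. A minimal error-free userland sketch of the same walk (every column is assumed valid, so "next valid column" is simply the next index; names are illustrative):

#include <stdio.h>

#define	MAXN	8

static void
walk_combinations(int ncols, int n)
{
	int tstore[MAXN + 2], *tgts = &tstore[1];
	int current, next, i;

	tgts[-1] = -1;			/* sentinel for the carry reset */
	for (i = 0; i < n; i++)		/* start with the first n columns */
		tgts[i] = i;
	tgts[n] = ncols;		/* sentinel for the "spot taken" test */

	current = 0;
	next = tgts[current];
	while (current != n) {
		tgts[current] = next;
		current = 0;

		/* Stand-in for a reconstruction attempt. */
		for (i = 0; i < n; i++)
			printf("%d%c", tgts[i], i == n - 1 ? '\n' : ' ');

		do {
			next = tgts[current] + 1;
			if (next != tgts[current + 1])
				break;		/* free slot: no carry */
			tgts[current] = tgts[current - 1] + 1;
			current++;		/* carry into the next digit */
		} while (current != n);
	}
}

int
main(void)
{
	walk_combinations(5, 2);	/* prints all 10 pairs of 5 columns */
	return (0);
}
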
+
+static int
+vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
+ off_t offset, size_t bytes)
+{
+ vdev_t *tvd = vd->v_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, error;
+ int unexpected_errors;
+ int parity_errors;
+ int parity_untried;
+ int data_errors;
+ int total_errors;
+ int n;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int code;
+
+ rc = NULL; /* gcc */
+ error = 0;
+
+ rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
+ vd->v_nchildren, vd->v_nparity);
+
/*
- * Iterate over the columns in reverse order so that we hit
- * the parity last -- any errors along the way will force us
- * to read the parity data.
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity.
*/
- missingdata = 0;
- missingparity = 0;
- for (c = acols - 1; c >= 0; c--) {
- rc = &cols[c];
- devidx = rc->rc_devidx;
- STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
- if (kid->v_id == devidx)
- break;
- if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
- if (c >= nparity)
- missingdata++;
+ for (c = rm->rm_cols - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ cvd = vdev_child(vd, rc->rc_devidx);
+ if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
else
- missingparity++;
+ rm->rm_missingparity++;
rc->rc_error = ENXIO;
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
-#if 0
- /*
- * Too hard for the bootcode
- */
- if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
- if (c >= nparity)
+#if 0 /* XXX: Too hard for the boot code. */
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
rm->rm_missingparity++;
@@ -661,28 +1491,31 @@ vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
continue;
}
#endif
- if (c >= nparity || missingdata > 0) {
- if (rc->rc_data)
- rc->rc_error = kid->v_read(kid, NULL,
- rc->rc_data, rc->rc_offset, rc->rc_size);
- else
- rc->rc_error = ENXIO;
+ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
+ rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
+ rc->rc_offset, rc->rc_size);
rc->rc_tried = 1;
rc->rc_skipped = 0;
}
}
reconstruct:
+ unexpected_errors = 0;
parity_errors = 0;
+ parity_untried = 0;
data_errors = 0;
- unexpected_errors = 0;
total_errors = 0;
- parity_untried = 0;
- for (c = 0; c < acols; c++) {
- rc = &cols[c];
+
+ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
+ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
if (rc->rc_error) {
- if (c < nparity)
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ if (c < rm->rm_firstdatacol)
parity_errors++;
else
data_errors++;
@@ -691,7 +1524,7 @@ reconstruct:
unexpected_errors++;
total_errors++;
- } else if (c < nparity && !rc->rc_tried) {
+ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
}
@@ -702,26 +1535,38 @@ reconstruct:
* 2. read all disks and try again
* 3. perform combinatorial reconstruction
*
- * Each phase is progressively both more expensive and less
- * likely to occur. If we encounter more errors than we can
- * repair or all phases fail, we have no choice but to return
- * an error.
+ * Each phase is progressively both more expensive and less likely to
+ * occur. If we encounter more errors than we can repair or all phases
+ * fail, we have no choice but to return an error.
*/
/*
- * If the number of errors we saw was correctable -- less than
- * or equal to the number of parity disks read -- attempt to
- * produce data that has a valid checksum. Naturally, this
- * case applies in the absence of any errors.
+ * If the number of errors we saw was correctable -- less than or equal
+ * to the number of parity disks read -- attempt to produce data that
+ * has a valid checksum. Naturally, this case applies in the absence of
+ * any errors.
*/
- if (total_errors <= nparity - parity_untried) {
- switch (data_errors) {
- case 0:
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
- break;
-
- case 1:
+ if (total_errors <= rm->rm_firstdatacol - parity_untried) {
+ if (data_errors == 0) {
+ if (raidz_checksum_verify(bp, data) == 0) {
+ /*
+ * If we read parity information (unnecessarily
+ * as it happens since no reconstruction was
+ * needed) regenerate and verify the parity.
+ * We also regenerate parity when resilvering
+ * so we can write it out to the failed device
+ * later.
+ */
+ if (parity_errors + parity_untried <
+ rm->rm_firstdatacol) {
+ n = raidz_parity_verify(rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+ goto done;
+ }
+ } else {
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
@@ -729,74 +1574,48 @@ reconstruct:
* also have been fewer parity errors than parity
* columns or, again, we wouldn't be in this code path.
*/
- //ASSERT(parity_untried == 0);
- //ASSERT(parity_errors < nparity);
-
- /*
- * Find the column that reported the error.
- */
- for (c = nparity; c < acols; c++) {
- rc = &cols[c];
- if (rc->rc_error != 0)
- break;
- }
- //ASSERT(c != acols);
- //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
-
- if (cols[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(cols, nparity,
- acols, c);
- } else {
- //ASSERT(nparity > 1);
- vdev_raidz_reconstruct_q(cols, nparity,
- acols, c);
- }
-
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- //ASSERT(nparity == 2);
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rm->rm_firstdatacol);
/*
- * Find the two columns that reported errors.
+ * Identify the data columns that reported an error.
*/
- for (c = nparity; c < acols; c++) {
- rc = &cols[c];
- if (rc->rc_error != 0)
- break;
+ n = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
}
- //ASSERT(c != acols);
- //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
- for (c1 = c++; c < acols; c++) {
- rc = &cols[c];
- if (rc->rc_error != 0)
- break;
+ ASSERT(rm->rm_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+
+ if (raidz_checksum_verify(bp, data) == 0) {
+ /*
+ * If we read more parity disks than were used
+ * for reconstruction, confirm that the other
+ * parity disks produced correct data. This
+ * routine is suboptimal in that it regenerates
+ * the parity that we already used in addition
+ * to the parity that we're attempting to
+ * verify, but this should be a relatively
+ * uncommon case, and can be optimized if it
+ * becomes a problem. Note that we regenerate
+ * parity when resilvering so we can write it
+ * out to failed devices later.
+ */
+ if (parity_errors < rm->rm_firstdatacol - n) {
+ n = raidz_parity_verify(rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+
+ goto done;
}
- //ASSERT(c != acols);
- //ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
-
- if (temp_p == NULL)
- temp_p = zfs_alloc_temp(max_rc_size);
- if (temp_q == NULL)
- temp_q = zfs_alloc_temp(max_rc_size);
-
- vdev_raidz_reconstruct_pq(cols, nparity, acols,
- c1, c, temp_p, temp_q);
-
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
- break;
-
- default:
- break;
- //ASSERT(nparity <= 2);
- //ASSERT(0);
}
}
@@ -808,38 +1627,29 @@ reconstruct:
* before, all children will be marked as tried so we'll
* proceed to combinatorial reconstruction.
*/
+ unexpected_errors = 1;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+
n = 0;
- for (c = 0; c < acols; c++) {
- rc = &cols[c];
- if (rc->rc_tried)
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+		if (rc->rc_tried)
continue;
- devidx = rc->rc_devidx;
- STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
- if (kid->v_id == devidx)
- break;
- if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
- rc->rc_error = ENXIO;
- rc->rc_tried = 1; /* don't even try */
- rc->rc_skipped = 1;
- continue;
- }
- if (rc->rc_data)
- rc->rc_error = kid->v_read(kid, NULL,
- rc->rc_data, rc->rc_offset, rc->rc_size);
- else
- rc->rc_error = ENXIO;
+ cvd = vdev_child(vd, rc->rc_devidx);
+ ASSERT(cvd != NULL);
+ rc->rc_error = cvd->v_read(cvd, NULL,
+ rc->rc_data, rc->rc_offset, rc->rc_size);
if (rc->rc_error == 0)
n++;
rc->rc_tried = 1;
rc->rc_skipped = 0;
}
-
/*
* If we managed to read anything more, retry the
* reconstruction.
*/
- if (n)
+ if (n > 0)
goto reconstruct;
/*
@@ -847,88 +1657,41 @@ reconstruct:
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction make
- * sure we have a chance of coming up with the right answer.
+ * in absent data. We check if there is enough additional data to
+ * possibly reconstruct the data and then perform combinatorial
+ * reconstruction over all possible combinations. If that fails,
+ * we're cooked.
*/
- if (total_errors >= nparity) {
- return (EIO);
- }
-
- if (cols[VDEV_RAIDZ_P].rc_error == 0) {
+ if (total_errors > rm->rm_firstdatacol) {
+ error = EIO;
+ } else if (total_errors < rm->rm_firstdatacol &&
+ (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
+ data_errors)) != 0) {
/*
- * Attempt to reconstruct the data from parity P.
+ * If we didn't use all the available parity for the
+ * combinatorial reconstruction, verify that the remaining
+ * parity is correct.
*/
- if (orig == NULL)
- orig = zfs_alloc_temp(max_rc_size);
- for (c = nparity; c < acols; c++) {
- rc = &cols[c];
-
- memcpy(orig, rc->rc_data, rc->rc_size);
- vdev_raidz_reconstruct_p(cols, nparity, acols, c);
-
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
-
- memcpy(rc->rc_data, orig, rc->rc_size);
- }
- }
-
- if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
+ if (code != (1 << rm->rm_firstdatacol) - 1)
+ (void) raidz_parity_verify(rm);
+ } else {
/*
- * Attempt to reconstruct the data from parity Q.
+ * We're here because either:
+ *
+	 *	total_errors == rm_firstdatacol, or
+ * vdev_raidz_combrec() failed
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+	 * Start checksum ereports for all children that haven't
+	 * failed, provided the I/O wasn't speculative.
*/
- if (orig == NULL)
- orig = zfs_alloc_temp(max_rc_size);
- for (c = nparity; c < acols; c++) {
- rc = &cols[c];
-
- memcpy(orig, rc->rc_data, rc->rc_size);
- vdev_raidz_reconstruct_q(cols, nparity, acols, c);
-
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
-
- memcpy(rc->rc_data, orig, rc->rc_size);
- }
+ error = ECKSUM;
}
- if (nparity > 1 &&
- cols[VDEV_RAIDZ_P].rc_error == 0 &&
- cols[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from both P and Q.
- */
- if (orig == NULL)
- orig = zfs_alloc_temp(max_rc_size);
- if (orig1 == NULL)
- orig1 = zfs_alloc_temp(max_rc_size);
- if (temp_p == NULL)
- temp_p = zfs_alloc_temp(max_rc_size);
- if (temp_q == NULL)
- temp_q = zfs_alloc_temp(max_rc_size);
- for (c = nparity; c < acols - 1; c++) {
- rc = &cols[c];
-
- memcpy(orig, rc->rc_data, rc->rc_size);
-
- for (c1 = c + 1; c1 < acols; c1++) {
- rc1 = &cols[c1];
-
- memcpy(orig1, rc1->rc_data, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(cols, nparity,
- acols, c, c1, temp_p, temp_q);
-
- if (zio_checksum_error(bp, buf) == 0)
- return (0);
+done:
+ vdev_raidz_map_free(rm);
- memcpy(rc1->rc_data, orig1, rc1->rc_size);
- }
-
- memcpy(rc->rc_data, orig, rc->rc_size);
- }
- }
-
- return (EIO);
+ return (error);
}
-
diff --git a/sys/cddl/boot/zfs/zle.c b/sys/cddl/boot/zfs/zle.c
new file mode 100644
index 000000000000..361b1b29ee9c
--- /dev/null
+++ b/sys/cddl/boot/zfs/zle.c
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next b - n + 1 bytes are zero.
+ */
+
+static int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ unsigned char *src = s_start;
+ unsigned char *dst = d_start;
+ unsigned char *s_end = src + s_len;
+ unsigned char *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
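
A worked decode, assuming the compression parameter n = 64 (the value the kernel's compression table passes for ZLE; treat it as illustrative here): the length byte 0x02 announces three literals, and 0x41 (65 >= 64) announces 65 + 1 - 64 = 2 zero bytes. A userland sketch, compiled in the same translation unit since zle_decompress() is static above:

#include <stdio.h>

int
main(void)
{
	unsigned char src[] = { 0x02, 'a', 'b', 'c', 0x41 };
	unsigned char dst[5];

	/* Expect "abc" followed by two zero bytes. */
	if (zle_decompress(src, dst, sizeof (src), sizeof (dst), 64) == 0)
		printf("decoded: %c%c%c %d %d\n",
		    dst[0], dst[1], dst[2], dst[3], dst[4]);
	return (0);
}
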
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
index 72d155728681..4aba9ea6cfe5 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
@@ -83,8 +83,7 @@ atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
return (newval);
}
-#if defined(__sparc64__) || defined(__powerpc__) || defined(__arm__) || \
- defined(__mips__)
+#if defined(__powerpc__) || defined(__arm__) || defined(__mips__)
void
atomic_or_8(volatile uint8_t *target, uint8_t value)
{
@@ -105,27 +104,23 @@ atomic_or_8_nv(volatile uint8_t *target, uint8_t value)
return (newval);
}
-#ifndef __LP64__
-void *
-atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
{
- void *oldval, **trg;
+ uint64_t oldval;
mtx_lock(&atomic_mtx);
- trg = __DEVOLATILE(void **, target);
- oldval = *trg;
+ oldval = *target;
if (oldval == cmp)
- *trg = newval;
+ *target = newval;
mtx_unlock(&atomic_mtx);
return (oldval);
}
-#endif
-#ifndef __sparc64__
-uint64_t
-atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+uint32_t
+atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval)
{
- uint64_t oldval;
+ uint32_t oldval;
mtx_lock(&atomic_mtx);
oldval = *target;
@@ -134,7 +129,6 @@ atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
mtx_unlock(&atomic_mtx);
return (oldval);
}
-#endif
void
membar_producer(void)
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
index 5d4ba062a982..3bcbc0ba02f6 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -46,7 +46,7 @@ __FBSDID("$FreeBSD$");
#endif
#ifdef _KERNEL
-static MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
#else
#define malloc(size, type, flags) malloc(size)
#define free(addr, type) free(addr)
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
index 4ac666dfbe67..595f3c0b3c55 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c
@@ -33,40 +33,22 @@ __FBSDID("$FreeBSD$");
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/misc.h>
-#include <sys/sunddi.h>
+#include <sys/sysctl.h>
char hw_serial[11] = "0";
struct opensolaris_utsname utsname = {
- .nodename = "unset",
- .sysname = "SunOS"
+ .machine = MACHINE
};
-int
-ddi_strtol(const char *str, char **nptr, int base, long *result)
+static void
+opensolaris_utsname_init(void *arg)
{
- *result = strtol(str, nptr, base);
- if (*result == 0)
- return (EINVAL);
- else if (*result == LONG_MIN || *result == LONG_MAX)
- return (ERANGE);
- return (0);
-}
-
-int
-ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
-{
-
- if (str == hw_serial) {
- *result = prison0.pr_hostid;
- return (0);
- }
-
- *result = strtoul(str, nptr, base);
- if (*result == 0)
- return (EINVAL);
- else if (*result == ULONG_MAX)
- return (ERANGE);
- return (0);
+ utsname.sysname = ostype;
+ utsname.nodename = prison0.pr_hostname;
+ utsname.release = osrelease;
+ snprintf(utsname.version, sizeof(utsname.version), "%d", osreldate);
}
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ opensolaris_utsname_init, NULL);
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
index 3b22c44082eb..019f29fde8c6 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c
@@ -38,47 +38,47 @@ __FBSDID("$FreeBSD$");
#include <sys/zfs_vfsops.h>
int
-secpolicy_nfs(struct ucred *cred)
+secpolicy_nfs(cred_t *cr)
{
- return (priv_check_cred(cred, PRIV_NFS_DAEMON, 0));
+ return (priv_check_cred(cr, PRIV_NFS_DAEMON, 0));
}
int
-secpolicy_zfs(struct ucred *cred)
+secpolicy_zfs(cred_t *cr)
{
- return (priv_check_cred(cred, PRIV_VFS_MOUNT, 0));
+ return (priv_check_cred(cr, PRIV_VFS_MOUNT, 0));
}
int
-secpolicy_sys_config(struct ucred *cred, int checkonly __unused)
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
{
- return (priv_check_cred(cred, PRIV_ZFS_POOL_CONFIG, 0));
+ return (priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG, 0));
}
int
-secpolicy_zinject(struct ucred *cred)
+secpolicy_zinject(cred_t *cr)
{
- return (priv_check_cred(cred, PRIV_ZFS_INJECT, 0));
+ return (priv_check_cred(cr, PRIV_ZFS_INJECT, 0));
}
int
-secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp __unused)
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
{
- return (priv_check_cred(cred, PRIV_VFS_UNMOUNT, 0));
+ return (priv_check_cred(cr, PRIV_VFS_UNMOUNT, 0));
}
int
-secpolicy_fs_owner(struct mount *mp, struct ucred *cred)
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
{
if (zfs_super_owner) {
- if (cred->cr_uid == mp->mnt_cred->cr_uid &&
- cred->cr_prison == mp->mnt_cred->cr_prison) {
+ if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+ cr->cr_prison == mp->mnt_cred->cr_prison) {
return (0);
}
}
@@ -90,75 +90,129 @@ secpolicy_fs_owner(struct mount *mp, struct ucred *cred)
*/
extern int hardlink_check_uid;
int
-secpolicy_basic_link(struct vnode *vp, struct ucred *cred)
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
{
if (!hardlink_check_uid)
return (0);
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_LINK, 0));
+ return (priv_check_cred(cr, PRIV_VFS_LINK, 0));
}
int
-secpolicy_vnode_stky_modify(struct ucred *cred)
+secpolicy_vnode_stky_modify(cred_t *cr)
{
return (EPERM);
}
int
-secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred)
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
{
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0));
+ return (priv_check_cred(cr, PRIV_VFS_ADMIN, 0));
}
int
-secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner,
- accmode_t accmode)
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
{
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- if ((accmode & VREAD) && priv_check_cred(cred, PRIV_VFS_READ, 0) != 0)
+ if ((accmode & VREAD) && priv_check_cred(cr, PRIV_VFS_READ, 0) != 0)
return (EACCES);
if ((accmode & VWRITE) &&
- priv_check_cred(cred, PRIV_VFS_WRITE, 0) != 0) {
+ priv_check_cred(cr, PRIV_VFS_WRITE, 0) != 0) {
return (EACCES);
}
if (accmode & VEXEC) {
if (vp->v_type == VDIR) {
- if (priv_check_cred(cred, PRIV_VFS_LOOKUP, 0) != 0) {
+ if (priv_check_cred(cr, PRIV_VFS_LOOKUP, 0) != 0)
return (EACCES);
- }
} else {
- if (priv_check_cred(cred, PRIV_VFS_EXEC, 0) != 0) {
+ if (priv_check_cred(cr, PRIV_VFS_EXEC, 0) != 0)
return (EACCES);
- }
}
}
return (0);
}
+/*
+ * Like secpolicy_vnode_access(), but we are given the actual wanted mode and
+ * the current mode of the file, rather than the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode)
+{
+ accmode_t mode;
+
+ mode = ~curmode & wantmode;
+
+ if (mode == 0)
+ return (0);
+
+ return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
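+
The ~curmode & wantmode step is easy to check by hand. A tiny userland illustration with stand-in permission bits (the kernel's actual VREAD/VWRITE values are not assumed):

#include <stdio.h>

#define	XREAD	0x01	/* stand-in for VREAD */
#define	XWRITE	0x02	/* stand-in for VWRITE */

int
main(void)
{
	int curmode = XREAD;		/* access we already hold */
	int wantmode = XREAD | XWRITE;	/* access the caller wants */
	int mode = ~curmode & wantmode;	/* only the missing bits */

	printf("mode = %#x (just XWRITE)\n", mode);
	return (0);
}
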
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+ static int privs[] = {
+ PRIV_VFS_ADMIN,
+ PRIV_VFS_READ,
+ PRIV_VFS_WRITE,
+ PRIV_VFS_EXEC,
+ PRIV_VFS_LOOKUP
+ };
+ int i;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* Same as secpolicy_vnode_setdac */
+ if (owner == cr->cr_uid)
+ return (0);
+
+ for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+ boolean_t allzone = B_FALSE;
+ int priv;
+
+ switch (priv = privs[i]) {
+ case PRIV_VFS_EXEC:
+ if (vp->v_type == VDIR)
+ continue;
+ break;
+ case PRIV_VFS_LOOKUP:
+ if (vp->v_type != VDIR)
+ continue;
+ break;
+ }
+ if (priv_check_cred(cr, priv, 0) == 0)
+ return (0);
+ }
+ return (EPERM);
+}
+
int
-secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred, uid_t owner)
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
{
- if (owner == cred->cr_uid)
+ if (owner == cr->cr_uid)
return (0);
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0));
+ return (priv_check_cred(cr, PRIV_VFS_ADMIN, 0));
}
int
-secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
const struct vattr *ovap, int flags,
- int unlocked_access(void *, int, struct ucred *), void *node)
+ int unlocked_access(void *, int, cred_t *), void *node)
{
int mask = vap->va_mask;
int error;
@@ -166,7 +220,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
if (mask & AT_SIZE) {
if (vp->v_type == VDIR)
return (EISDIR);
- error = unlocked_access(node, VWRITE, cred);
+ error = unlocked_access(node, VWRITE, cr);
if (error)
return (error);
}
@@ -179,17 +233,17 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
* In the specific case of creating a set-uid root
* file, we need even more permissions.
*/
- error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
if (error)
return (error);
- error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cred);
+ error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
if (error)
return (error);
} else {
vap->va_mode = ovap->va_mode;
}
if (mask & (AT_UID | AT_GID)) {
- error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
if (error)
return (error);
@@ -200,9 +254,9 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
*/
if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
- !groupmember(vap->va_gid, cred))) {
- if (secpolicy_fs_owner(vp->v_mount, cred) != 0) {
- error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
+ !groupmember(vap->va_gid, cr))) {
+ if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+ error = priv_check_cred(cr, PRIV_VFS_CHOWN, 0);
if (error)
return (error);
}
@@ -210,7 +264,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
- secpolicy_setid_clear(vap, vp, cred);
+ secpolicy_setid_clear(vap, vp, cr);
}
}
if (mask & (AT_ATIME | AT_MTIME)) {
@@ -222,9 +276,9 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
* If times is non-NULL, ... The caller must be the owner of
* the file or be the super-user.
*/
- error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid);
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
if (error && (vap->va_vaflags & VA_UTIMES_NULL))
- error = unlocked_access(node, VWRITE, cred);
+ error = unlocked_access(node, VWRITE, cr);
if (error)
return (error);
}
@@ -232,41 +286,42 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap,
}
int
-secpolicy_vnode_create_gid(struct ucred *cred)
+secpolicy_vnode_create_gid(cred_t *cr)
{
return (EPERM);
}
int
-secpolicy_vnode_setids_setgids(vnode_t *vp, struct ucred *cred, gid_t gid)
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
{
- if (groupmember(gid, cred))
+
+ if (groupmember(gid, cr))
return (0);
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_SETGID, 0));
+ return (priv_check_cred(cr, PRIV_VFS_SETGID, 0));
}
int
-secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred,
+secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
boolean_t issuidroot __unused)
{
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0));
+ return (priv_check_cred(cr, PRIV_VFS_RETAINSUGID, 0));
}
void
-secpolicy_setid_clear(struct vattr *vap, struct vnode *vp, struct ucred *cred)
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
{
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return;
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
- if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
+ if (priv_check_cred(cr, PRIV_VFS_RETAINSUGID, 0)) {
vap->va_mask |= AT_MODE;
vap->va_mode &= ~(S_ISUID|S_ISGID);
}
@@ -274,12 +329,12 @@ secpolicy_setid_clear(struct vattr *vap, struct vnode *vp, struct ucred *cred)
}
int
-secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
- const struct vattr *ovap, struct ucred *cred)
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr)
{
int error;
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
/*
@@ -288,7 +343,7 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
* is not a member of. Both of these are allowed in jail(8).
*/
if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
- if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
+ if (priv_check_cred(cr, PRIV_VFS_STICKYFILE, 0))
return (EFTYPE);
}
/*
@@ -296,15 +351,15 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
* group-id bit.
*/
if ((vap->va_mode & S_ISGID) != 0) {
- error = secpolicy_vnode_setids_setgids(vp, cred, ovap->va_gid);
+ error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
if (error)
return (error);
}
/*
* Deny setting setuid if we are not the file owner.
*/
- if ((vap->va_mode & S_ISUID) && ovap->va_uid != cred->cr_uid) {
- error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
+ if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+ error = priv_check_cred(cr, PRIV_VFS_ADMIN, 0);
if (error)
return (error);
}
@@ -319,25 +374,25 @@ secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
}
int
-secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner)
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
{
- if (owner == cred->cr_uid)
+ if (owner == cr->cr_uid)
return (0);
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
/* XXX: vfs_suser()? */
- return (priv_check_cred(cred, PRIV_VFS_MOUNT_OWNER, 0));
+ return (priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER, 0));
}
int
-secpolicy_vnode_chown(struct vnode *vp, cred_t *cred, uid_t owner)
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
{
- if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
return (0);
- return (priv_check_cred(cred, PRIV_VFS_CHOWN, 0));
+ return (priv_check_cred(cr, PRIV_VFS_CHOWN, 0));
}
void
@@ -357,7 +412,7 @@ secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
* Check privileges for setting xvattr attributes
*/
int
-secpolicy_xvattr(struct vnode *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
vtype_t vtype)
{
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_string.c b/sys/cddl/compat/opensolaris/kern/opensolaris_string.c
index de9169e9df2e..2150608611cf 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_string.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_string.c
@@ -27,6 +27,8 @@
#include <sys/param.h>
#include <sys/string.h>
+#include <sys/kmem.h>
+#include <machine/stdarg.h>
#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
@@ -71,3 +73,34 @@ strident_canon(char *s, size_t n)
}
*s = 0;
}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ char *buf;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx) + 1;
+ va_end(adx);
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, size, fmt, adx);
+ va_end(adx);
+
+ return (buf);
+}
+
+void
+strfree(char *str)
+{
+ ASSERT(str != NULL);
+ kmem_free(str, strlen(str) + 1);
+}
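
A sketch of the intended pairing, in kernel context (the function name is illustrative): strfree() recovers the allocation size via strlen(), which is why the returned string's length must never change.

static void
kmem_asprintf_example(void)
{
	char *msg;

	/* Size is derived from the formatted length. */
	msg = kmem_asprintf("vdev %s: %d errors", "da0", 3);
	/* ... use msg, but never shorten or extend it in place ... */
	strfree(msg);
}
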
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c b/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c
new file mode 100644
index 000000000000..bb56909ebe90
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c
@@ -0,0 +1,198 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sunddi.h>
+#include <sys/sysctl.h>
+
+int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+
+ *result = strtol(str, nptr, base);
+ if (*result == 0)
+ return (EINVAL);
+ else if (*result == LONG_MIN || *result == LONG_MAX)
+ return (ERANGE);
+ return (0);
+}
+
+int
+ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
+{
+
+ if (str == hw_serial) {
+ *result = prison0.pr_hostid;
+ return (0);
+ }
+
+ *result = strtoul(str, nptr, base);
+ if (*result == 0)
+ return (EINVAL);
+ else if (*result == ULONG_MAX)
+ return (ERANGE);
+ return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+ *result = (unsigned long long)strtouq(str, nptr, base);
+ if (*result == 0)
+ return (EINVAL);
+ else if (*result == ULLONG_MAX)
+ return (ERANGE);
+ return (0);
+}
+
+struct ddi_soft_state_item {
+ int ssi_item;
+ void *ssi_data;
+ LIST_ENTRY(ddi_soft_state_item) ssi_next;
+};
+
+struct ddi_soft_state {
+ size_t ss_size;
+ kmutex_t ss_lock;
+ LIST_HEAD(, ddi_soft_state_item) ss_list;
+};
+
+static void *
+ddi_get_soft_state_locked(struct ddi_soft_state *ss, int item)
+{
+ struct ddi_soft_state_item *itemp;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ LIST_FOREACH(itemp, &ss->ss_list, ssi_next) {
+ if (itemp->ssi_item == item)
+ return (itemp->ssi_data);
+ }
+ return (NULL);
+}
+
+void *
+ddi_get_soft_state(void *state, int item)
+{
+ struct ddi_soft_state *ss = state;
+ void *data;
+
+ mutex_enter(&ss->ss_lock);
+ data = ddi_get_soft_state_locked(ss, item);
+ mutex_exit(&ss->ss_lock);
+ return (data);
+}
+
+int
+ddi_soft_state_zalloc(void *state, int item)
+{
+ struct ddi_soft_state *ss = state;
+ struct ddi_soft_state_item *itemp;
+
+ itemp = kmem_alloc(sizeof(*itemp), KM_SLEEP);
+ itemp->ssi_item = item;
+ itemp->ssi_data = kmem_zalloc(ss->ss_size, KM_SLEEP);
+
+ mutex_enter(&ss->ss_lock);
+ if (ddi_get_soft_state_locked(ss, item) != NULL) {
+ mutex_exit(&ss->ss_lock);
+ kmem_free(itemp->ssi_data, ss->ss_size);
+ kmem_free(itemp, sizeof(*itemp));
+ return (DDI_FAILURE);
+ }
+ LIST_INSERT_HEAD(&ss->ss_list, itemp, ssi_next);
+ mutex_exit(&ss->ss_lock);
+ return (DDI_SUCCESS);
+}
+
+static void
+ddi_soft_state_free_locked(struct ddi_soft_state *ss, int item)
+{
+ struct ddi_soft_state_item *itemp;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ LIST_FOREACH(itemp, &ss->ss_list, ssi_next) {
+ if (itemp->ssi_item == item)
+ break;
+ }
+ if (itemp != NULL) {
+ LIST_REMOVE(itemp, ssi_next);
+ kmem_free(itemp->ssi_data, ss->ss_size);
+ kmem_free(itemp, sizeof(*itemp));
+ }
+}
+
+void
+ddi_soft_state_free(void *state, int item)
+{
+ struct ddi_soft_state *ss = state;
+
+ mutex_enter(&ss->ss_lock);
+ ddi_soft_state_free_locked(ss, item);
+ mutex_exit(&ss->ss_lock);
+}
+
+int
+ddi_soft_state_init(void **statep, size_t size, size_t nitems __unused)
+{
+ struct ddi_soft_state *ss;
+
+ ss = kmem_alloc(sizeof(*ss), KM_SLEEP);
+ mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
+ ss->ss_size = size;
+ LIST_INIT(&ss->ss_list);
+ *statep = ss;
+ return (0);
+}
+
+void
+ddi_soft_state_fini(void **statep)
+{
+ struct ddi_soft_state *ss = *statep;
+ struct ddi_soft_state_item *itemp;
+ int item;
+
+ mutex_enter(&ss->ss_lock);
+ while ((itemp = LIST_FIRST(&ss->ss_list)) != NULL) {
+ item = itemp->ssi_item;
+ ddi_soft_state_free_locked(ss, item);
+ }
+ mutex_exit(&ss->ss_lock);
+ mutex_destroy(&ss->ss_lock);
+ kmem_free(ss, sizeof(*ss));
+
+ *statep = NULL;
+}
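
A sketch of the intended soft-state lifecycle, in kernel context; the softc layout and the instance number are illustrative, and error handling is elided:

struct example_softc {
	int sc_flags;
};

static void *example_state;

static void
example_lifecycle(int minor)
{
	struct example_softc *sc;

	(void) ddi_soft_state_init(&example_state,
	    sizeof (struct example_softc), 0);

	if (ddi_soft_state_zalloc(example_state, minor) == DDI_SUCCESS) {
		/* Zeroed per-instance data, looked up by instance number. */
		sc = ddi_get_soft_state(example_state, minor);
		sc->sc_flags = 1;
		ddi_soft_state_free(example_state, minor);
	}

	ddi_soft_state_fini(&example_state);
}
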
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c b/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c
new file mode 100644
index 000000000000..dea618cf5e34
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c
@@ -0,0 +1,334 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/sbuf.h>
+#include <sys/bus.h>
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+
+struct sysevent {
+ nvlist_t *se_nvl;
+ char se_class[128];
+ char se_subclass[128];
+ char se_pub[128];
+};
+
+sysevent_t *
+sysevent_alloc(char *class, char *subclass, char *pub, int flag)
+{
+ struct sysevent *ev;
+
+ ASSERT(class != NULL);
+ ASSERT(subclass != NULL);
+ ASSERT(pub != NULL);
+ ASSERT(flag == SE_SLEEP);
+
+ ev = kmem_alloc(sizeof(*ev), KM_SLEEP);
+ ev->se_nvl = NULL;
+ strlcpy(ev->se_class, class, sizeof(ev->se_class));
+ strlcpy(ev->se_subclass, subclass, sizeof(ev->se_subclass));
+ strlcpy(ev->se_pub, pub, sizeof(ev->se_pub));
+
+ return ((sysevent_t *)ev);
+}
+
+void
+sysevent_free(sysevent_t *evp)
+{
+ struct sysevent *ev = (struct sysevent *)evp;
+
+ ASSERT(evp != NULL);
+
+ if (ev->se_nvl != NULL)
+ sysevent_free_attr(ev->se_nvl);
+ kmem_free(ev, sizeof(*ev));
+}
+
+int
+sysevent_add_attr(sysevent_attr_list_t **ev_attr_list, char *name,
+ sysevent_value_t *se_value, int flag)
+{
+ nvlist_t *nvl;
+ int error;
+
+ ASSERT(ev_attr_list != NULL);
+ ASSERT(name != NULL);
+ ASSERT(se_value != NULL);
+ ASSERT(flag == SE_SLEEP);
+
+ if (strlen(name) >= MAX_ATTR_NAME)
+ return (SE_EINVAL);
+
+ nvl = *ev_attr_list;
+ if (nvl == NULL) {
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0)
+ return (SE_ENOMEM);
+ }
+
+ error = 0;
+
+ switch (se_value->value_type) {
+ case SE_DATA_TYPE_UINT64:
+ error = nvlist_add_uint64(nvl, name, se_value->value.sv_uint64);
+ break;
+ case SE_DATA_TYPE_STRING:
+ if (strlen(se_value->value.sv_string) >= MAX_STRING_SZ)
+ error = SE_EINVAL;
+ if (error == 0) {
+ error = nvlist_add_string(nvl, name,
+ se_value->value.sv_string);
+ }
+ break;
+ default:
+ printf("%s: type %d is not implemented\n", __func__,
+ se_value->value_type);
+ break;
+ }
+
+ if (error != 0) {
+ nvlist_free(nvl);
+ return (error);
+ }
+
+ *ev_attr_list = nvl;
+
+ return (0);
+}
+
+void
+sysevent_free_attr(sysevent_attr_list_t *ev_attr_list)
+{
+
+ nvlist_free(ev_attr_list);
+}
+
+int
+sysevent_attach_attributes(sysevent_t *evp, sysevent_attr_list_t *ev_attr_list)
+{
+ struct sysevent *ev = (struct sysevent *)evp;
+
+ ASSERT(ev->se_nvl == NULL);
+
+ ev->se_nvl = ev_attr_list;
+
+ return (0);
+}
+
+void
+sysevent_detach_attributes(sysevent_t *evp)
+{
+ struct sysevent *ev = (struct sysevent *)evp;
+
+ ASSERT(ev->se_nvl != NULL);
+
+ ev->se_nvl = NULL;
+}
+
+int
+log_sysevent(sysevent_t *evp, int flag, sysevent_id_t *eid)
+{
+ struct sysevent *ev = (struct sysevent *)evp;
+ struct sbuf *sb;
+ const char *type;
+ char typestr[128];
+ nvpair_t *elem = NULL;
+
+ ASSERT(evp != NULL);
+ ASSERT(ev->se_nvl != NULL);
+ ASSERT(flag == SE_SLEEP);
+ ASSERT(eid != NULL);
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (SE_ENOMEM);
+ type = NULL;
+
+ while ((elem = nvlist_next_nvpair(ev->se_nvl, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ case DATA_TYPE_BOOLEAN:
+ {
+ boolean_t value;
+
+ (void) nvpair_value_boolean_value(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+ value ? "true" : "false");
+ break;
+ }
+ case DATA_TYPE_UINT8:
+ {
+ uint8_t value;
+
+ (void) nvpair_value_uint8(elem, &value);
+ sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+ break;
+ }
+ case DATA_TYPE_INT32:
+ {
+ int32_t value;
+
+ (void) nvpair_value_int32(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT32:
+ {
+ uint32_t value;
+
+ (void) nvpair_value_uint32(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_INT64:
+ {
+ int64_t value;
+
+ (void) nvpair_value_int64(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT64:
+ {
+ uint64_t value;
+
+ (void) nvpair_value_uint64(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_STRING:
+ {
+ char *value;
+
+ (void) nvpair_value_string(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+ if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+ type = value;
+ break;
+ }
+ case DATA_TYPE_UINT8_ARRAY:
+ {
+ uint8_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint8_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%02hhx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT16_ARRAY:
+ {
+ uint16_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint16_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%04hx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT32_ARRAY:
+ {
+ uint32_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint32_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY:
+ {
+ uint64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+ break;
+ }
+ default:
+ printf("%s: type %d is not implemented\n", __func__,
+ nvpair_type(elem));
+ break;
+ }
+ }
+
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ return (SE_ENOMEM);
+ }
+
+ if (type == NULL)
+ type = ev->se_subclass;
+ if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+ snprintf(typestr, sizeof(typestr), "misc.fs.zfs.%s", type + 8);
+ type = typestr;
+ }
+ devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+ sbuf_delete(sb);
+
+ return (0);
+}
+
+int
+_ddi_log_sysevent(char *vendor, char *class, char *subclass,
+ nvlist_t *attr_list, sysevent_id_t *eidp, int flag)
+{
+ sysevent_t *ev;
+ int ret;
+
+ ASSERT(vendor != NULL);
+ ASSERT(class != NULL);
+ ASSERT(subclass != NULL);
+ ASSERT(attr_list != NULL);
+ ASSERT(eidp != NULL);
+ ASSERT(flag == DDI_SLEEP);
+
+ ev = sysevent_alloc(class, subclass, vendor, SE_SLEEP);
+ ASSERT(ev != NULL);
+ (void)sysevent_attach_attributes(ev, attr_list);
+ ret = log_sysevent(ev, SE_SLEEP, eidp);
+ sysevent_detach_attributes(ev);
+ sysevent_free(ev);
+
+ return (ret);
+}
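
A sketch of a caller, in kernel context; the vendor string and attribute key are illustrative. Note the ownership rule implied above: _ddi_log_sysevent() detaches the attribute list before freeing the event, so the caller still owns and frees the nvlist.

static void
example_notify(void)
{
	nvlist_t *attr;
	sysevent_id_t eid;

	if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0)
		return;
	(void) nvlist_add_string(attr, "pool", "tank");
	/* "ESC_ZFS_..." subclasses become "misc.fs.zfs..." devctl types. */
	(void) _ddi_log_sysevent("SUNW", "EC_zfs", "ESC_ZFS_config_sync",
	    attr, &eid, DDI_SLEEP);
	nvlist_free(attr);
}
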
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c b/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
index f7b31db6f825..5a204884b095 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
@@ -115,12 +115,17 @@ taskqid_t
taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
{
struct ostask *task;
- int mflag;
+ int mflag, prio;
if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
mflag = M_WAITOK;
else
mflag = M_NOWAIT;
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
task = uma_zalloc(taskq_zone, mflag);
if (task == NULL)
@@ -129,7 +134,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
task->ost_func = func;
task->ost_arg = arg;
- TASK_INIT(&task->ost_task, 0, taskq_run, task);
+ TASK_INIT(&task->ost_task, prio, taskq_run, task);
taskqueue_enqueue(tq->tq_queue, &task->ost_task);
return ((taskqid_t)(void *)task);
@@ -148,17 +153,24 @@ taskq_run_safe(void *arg, int pending __unused)
}
taskqid_t
-taskq_dispatch_safe(taskq_t *tq, task_func_t func, void *arg,
+taskq_dispatch_safe(taskq_t *tq, task_func_t func, void *arg, u_int flags,
struct ostask *task)
{
+ int prio;
ASSERT(task->ost_magic != TASKQ_MAGIC);
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+
task->ost_magic = TASKQ_MAGIC;
task->ost_func = func;
task->ost_arg = arg;
- TASK_INIT(&task->ost_task, 0, taskq_run_safe, task);
+ TASK_INIT(&task->ost_task, prio, taskq_run_safe, task);
taskqueue_enqueue(tq->tq_queue, &task->ost_task);
return ((taskqid_t)(void *)task);
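
In both dispatch paths the priority is just !!(flags & TQ_FRONT), so a TQ_FRONT task is enqueued with TASK_INIT priority 1 and taskqueue(9) places it ahead of the default priority-0 tasks. A usage sketch (function names illustrative):

static void
example_func(void *arg)
{
	/* Runs ahead of any queued priority-0 tasks. */
}

static void
example_dispatch(taskq_t *tq)
{
	(void) taskq_dispatch(tq, example_func, NULL, TQ_SLEEP | TQ_FRONT);
}
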
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
index 8538b5402787..be9f4ec1dfb7 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -115,10 +115,10 @@ int
mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
char *fspec, int fsflags)
{
- struct mount *mp;
struct vfsconf *vfsp;
+ struct mount *mp;
+ vnode_t *vp, *mvp;
struct ucred *cr;
- vnode_t *vp;
int error;
/*
@@ -153,8 +153,10 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
/*
* Allocate and initialize the filesystem.
+	 * We don't want the regular user that triggered the snapshot mount
+	 * to be able to unmount it, so pass the credentials of the parent
+	 * mount.
*/
- mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
+ mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
mp->mnt_optnew = NULL;
vfs_setmntopt(mp, "from", fspec, 0);
@@ -164,8 +166,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
/*
* Set the mount level flags.
*/
- mp->mnt_flag &= ~MNT_UPDATEMASK;
- mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
+ mp->mnt_flag = fsflags & MNT_UPDATEMASK;
/*
* Snapshots are always read-only.
*/
@@ -176,13 +177,6 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
*/
mp->mnt_flag |= MNT_IGNORE;
/*
- * Unprivileged user can trigger mounting a snapshot, but we don't want
- * him to unmount it, so we switch to privileged of original mount.
- */
- crfree(mp->mnt_cred);
- mp->mnt_cred = crdup(vp->v_mount->mnt_cred);
- mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid;
- /*
* XXX: This is evil, but we can't mount a snapshot as a regular user.
	 * XXX: Is it safe when a snapshot is mounted from within a jail?
*/
@@ -191,17 +185,25 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
error = VFS_MOUNT(mp);
td->td_ucred = cr;
- if (error == 0) {
- if (mp->mnt_opt != NULL)
- vfs_freeopts(mp->mnt_opt);
- mp->mnt_opt = mp->mnt_optnew;
- (void)VFS_STATFS(mp, &mp->mnt_stat);
+ if (error != 0) {
+ vrele(vp);
+ vfs_unbusy(mp);
+ vfs_mount_destroy(mp);
+ *vpp = NULL;
+ return (error);
}
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+
/*
* Prevent external consumers of mount options from reading
* mnt_optnew.
*/
mp->mnt_optnew = NULL;
+
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef FREEBSD_NAMECACHE
cache_purge(vp);
@@ -209,27 +211,17 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
- if (error == 0) {
- vnode_t *mvp;
-
- vp->v_mountedhere = mp;
- /*
- * Put the new filesystem on the mount list.
- */
- mtx_lock(&mountlist_mtx);
- TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
- mtx_unlock(&mountlist_mtx);
- vfs_event_signal(NULL, VQ_MOUNT, 0);
- if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
- panic("mount: lost mount");
- vput(vp);
- vfs_unbusy(mp);
- *vpp = mvp;
- } else {
- vput(vp);
- vfs_unbusy(mp);
- vfs_mount_destroy(mp);
- *vpp = NULL;
- }
- return (error);
+
+ vp->v_mountedhere = mp;
+ /* Put the new filesystem on the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+ panic("mount: lost mount");
+ vput(vp);
+ vfs_unbusy(mp);
+ *vpp = mvp;
+ return (0);
}
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
index f25a67c98430..55b19067e985 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c
@@ -229,6 +229,15 @@ zone_destroy(void *arg)
free(head, M_ZONES);
}
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+ KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+ return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
static void
zone_sysinit(void *arg __unused)
{
diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h
index ddfb32545c68..af9cc5d27e47 100644
--- a/sys/cddl/compat/opensolaris/sys/atomic.h
+++ b/sys/cddl/compat/opensolaris/sys/atomic.h
@@ -41,9 +41,10 @@ extern void atomic_add_64(volatile uint64_t *target, int64_t delta);
extern void atomic_dec_64(volatile uint64_t *target);
#endif
#ifndef __LP64__
-extern void *atomic_cas_ptr(volatile void *target, void *cmp, void *newval);
#endif
#ifndef __sparc64__
+extern uint32_t atomic_cas_32(volatile uint32_t *target, uint32_t cmp,
+ uint32_t newval);
extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp,
uint64_t newval);
#endif
@@ -118,13 +119,20 @@ atomic_inc_64_nv(volatile uint64_t *target)
return (atomic_add_64_nv(target, 1));
}
-#ifdef __LP64__
+#if !defined(COMPAT_32BIT) && defined(__LP64__)
static __inline void *
atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
{
- return ((void *)atomic_cas_64((volatile uint64_t *)target, (uint64_t)cmp,
- (uint64_t)newval));
+ return ((void *)atomic_cas_64((volatile uint64_t *)target,
+ (uint64_t)cmp, (uint64_t)newval));
}
-#endif
+#else
+static __inline void *
+atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+{
+ return ((void *)atomic_cas_32((volatile uint32_t *)target,
+ (uint32_t)cmp, (uint32_t)newval));
+}
+#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */
#endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */
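
A sketch of the usual compare-and-swap retry loop built on atomic_cas_ptr(); the list type and names are illustrative:

struct node {
	struct node *n_next;
};

static void
push(struct node * volatile *head, struct node *np)
{
	struct node *old;

	do {
		old = *head;
		np->n_next = old;
		/* Retry if another thread changed *head in between. */
	} while (atomic_cas_ptr(head, old, np) != old);
}
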
diff --git a/sys/cddl/compat/opensolaris/sys/byteorder.h b/sys/cddl/compat/opensolaris/sys/byteorder.h
index a8f8bc2569e3..72ae45b34655 100644
--- a/sys/cddl/compat/opensolaris/sys/byteorder.h
+++ b/sys/cddl/compat/opensolaris/sys/byteorder.h
@@ -59,9 +59,31 @@
* Macros to convert from a specific byte order to/from native byte order
*/
#if _BYTE_ORDER == _BIG_ENDIAN
+#define BE_8(x) BMASK_8(x)
+#define BE_16(x) BMASK_16(x)
+#define BE_32(x) BMASK_32(x)
+#define BE_64(x) BMASK_64(x)
+#define LE_8(x) BSWAP_8(x)
+#define LE_16(x) BSWAP_16(x)
+#define LE_32(x) BSWAP_32(x)
#define LE_64(x) BSWAP_64(x)
#else
+#define LE_8(x) BMASK_8(x)
+#define LE_16(x) BMASK_16(x)
+#define LE_32(x) BMASK_32(x)
#define LE_64(x) BMASK_64(x)
+#define BE_8(x) BSWAP_8(x)
+#define BE_16(x) BSWAP_16(x)
+#define BE_32(x) BSWAP_32(x)
+#define BE_64(x) BSWAP_64(x)
+#endif
+
+#if _BYTE_ORDER == _BIG_ENDIAN
+#define htonll(x) BMASK_64(x)
+#define ntohll(x) BMASK_64(x)
+#else
+#define htonll(x) BSWAP_64(x)
+#define ntohll(x) BSWAP_64(x)
#endif
#endif /* _OPENSOLARIS_SYS_BYTEORDER_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/dirent.h b/sys/cddl/compat/opensolaris/sys/dirent.h
index c369b04a6956..d273f8a0fce4 100644
--- a/sys/cddl/compat/opensolaris/sys/dirent.h
+++ b/sys/cddl/compat/opensolaris/sys/dirent.h
@@ -29,11 +29,14 @@
#ifndef _OPENSOLARIS_SYS_DIRENT_H_
#define _OPENSOLARIS_SYS_DIRENT_H_
+#include <sys/types.h>
+
#include_next <sys/dirent.h>
typedef struct dirent dirent64_t;
+typedef ino_t ino64_t;
+
#define dirent64 dirent
-#define ino64_t ino_t
#define d_ino d_fileno
diff --git a/sys/cddl/compat/opensolaris/sys/file.h b/sys/cddl/compat/opensolaris/sys/file.h
index afd10501d016..811b78cf87a6 100644
--- a/sys/cddl/compat/opensolaris/sys/file.h
+++ b/sys/cddl/compat/opensolaris/sys/file.h
@@ -31,26 +31,30 @@
#include_next <sys/file.h>
+#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */
+
#ifdef _KERNEL
typedef struct file file_t;
static __inline file_t *
-getf(int fd, int write)
+getf(int fd)
{
struct file *fp;
- if (write && fget_write(curthread, fd, &fp) == 0)
- return (fp);
- else if (!write && fget_read(curthread, fd, &fp) == 0)
+ if (fget(curthread, fd, &fp) == 0)
return (fp);
return (NULL);
}
static __inline void
-releasef(file_t *fp)
+releasef(int fd)
{
+ struct file *fp;
- fdrop(fp, curthread);
+ if (fget(curthread, fd, &fp) == 0) {
+ fdrop(fp, curthread);
+ fdrop(fp, curthread);
+ }
}
#endif /* _KERNEL */
diff --git a/sys/cddl/compat/opensolaris/sys/kmem.h b/sys/cddl/compat/opensolaris/sys/kmem.h
index c103d18b4e3b..6be273562ca1 100644
--- a/sys/cddl/compat/opensolaris/sys/kmem.h
+++ b/sys/cddl/compat/opensolaris/sys/kmem.h
@@ -37,10 +37,16 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
+MALLOC_DECLARE(M_SOLARIS);
+
+#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
+#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
+
#define KM_SLEEP M_WAITOK
#define KM_PUSHPAGE M_WAITOK
#define KM_NOSLEEP M_NOWAIT
#define KMC_NODEBUG 0
+#define KMC_NOTOUCH 0
typedef struct kmem_cache {
char kc_name[32];
@@ -75,4 +81,6 @@ void *calloc(size_t n, size_t s);
#define kmem_zalloc(size, kmflags) zfs_kmem_alloc((size), (kmflags) | M_ZERO)
#define kmem_free(buf, size) zfs_kmem_free((buf), (size))
+#define kmem_cache_set_move(cache, movefunc) do { } while (0)
+
#endif /* _OPENSOLARIS_SYS_KMEM_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/misc.h b/sys/cddl/compat/opensolaris/sys/misc.h
index 20d335b0674e..e128ce06d165 100644
--- a/sys/cddl/compat/opensolaris/sys/misc.h
+++ b/sys/cddl/compat/opensolaris/sys/misc.h
@@ -29,7 +29,9 @@
#ifndef _OPENSOLARIS_SYS_MISC_H_
#define _OPENSOLARIS_SYS_MISC_H_
-#define MAXUID 2147483647
+#include <sys/limits.h>
+
+#define MAXUID UID_MAX
#define SPEC_MAXOFFSET_T OFF_MAX
@@ -45,8 +47,11 @@
#ifdef _KERNEL
struct opensolaris_utsname {
- char *nodename;
- char *sysname;
+ char *sysname;
+ char *nodename;
+ char *release;
+ char version[32];
+ char *machine;
};
extern char hw_serial[11];
diff --git a/sys/cddl/compat/opensolaris/sys/mount.h b/sys/cddl/compat/opensolaris/sys/mount.h
index d4c40391c8df..e012597a92e6 100644
--- a/sys/cddl/compat/opensolaris/sys/mount.h
+++ b/sys/cddl/compat/opensolaris/sys/mount.h
@@ -29,6 +29,8 @@
#ifndef _OPENSOLARIS_SYS_MOUNT_H_
#define _OPENSOLARIS_SYS_MOUNT_H_
+#include <sys/param.h>
+
#include_next <sys/mount.h>
#define MS_FORCE MNT_FORCE
diff --git a/sys/cddl/compat/opensolaris/sys/mutex.h b/sys/cddl/compat/opensolaris/sys/mutex.h
index f6858a7b161d..ef058918d33f 100644
--- a/sys/cddl/compat/opensolaris/sys/mutex.h
+++ b/sys/cddl/compat/opensolaris/sys/mutex.h
@@ -54,7 +54,7 @@ typedef struct sx kmutex_t;
#define mutex_init(lock, desc, type, arg) do { \
const char *_name; \
- ASSERT((type) == MUTEX_DEFAULT); \
+ ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \
KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \
LO_EXPECTED, ("lock %s already initialized", #lock)); \
bzero((lock), sizeof(struct sx)); \
diff --git a/sys/cddl/compat/opensolaris/sys/policy.h b/sys/cddl/compat/opensolaris/sys/policy.h
index f61859b557e7..0b968aead205 100644
--- a/sys/cddl/compat/opensolaris/sys/policy.h
+++ b/sys/cddl/compat/opensolaris/sys/policy.h
@@ -36,41 +36,39 @@
#include <sys/vnode.h>
struct mount;
-struct ucred;
struct vattr;
-struct vnode;
-int secpolicy_nfs(struct ucred *cred);
-int secpolicy_zfs(struct ucred *cred);
-int secpolicy_sys_config(struct ucred *cred, int checkonly);
-int secpolicy_zinject(struct ucred *cred);
-int secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp);
-int secpolicy_basic_link(struct vnode *vp, struct ucred *cred);
-int secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner);
-int secpolicy_vnode_chown(struct vnode *vp, cred_t *cred, uid_t owner);
-int secpolicy_vnode_stky_modify(struct ucred *cred);
-int secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred);
-int secpolicy_vnode_access(struct ucred *cred, struct vnode *vp,
- uint64_t owner, accmode_t accmode);
-int secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred,
- uid_t owner);
-int secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp,
- struct vattr *vap, const struct vattr *ovap, int flags,
- int unlocked_access(void *, int, struct ucred *), void *node);
-int secpolicy_vnode_create_gid(struct ucred *cred);
-int secpolicy_vnode_setids_setgids(struct vnode *vp, struct ucred *cred,
- gid_t gid);
-int secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred,
+int secpolicy_nfs(cred_t *cr);
+int secpolicy_zfs(cred_t *crd);
+int secpolicy_sys_config(cred_t *cr, int checkonly);
+int secpolicy_zinject(cred_t *cr);
+int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp);
+int secpolicy_basic_link(vnode_t *vp, cred_t *cr);
+int secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_stky_modify(cred_t *cr);
+int secpolicy_vnode_remove(vnode_t *vp, cred_t *cr);
+int secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t accmode);
+int secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode);
+int secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner);
+int secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node);
+int secpolicy_vnode_create_gid(cred_t *cr);
+int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid);
+int secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
boolean_t issuidroot);
-void secpolicy_setid_clear(struct vattr *vap, struct vnode *vp,
- struct ucred *cred);
-int secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
- const struct vattr *ovap, struct ucred *cred);
-int secpolicy_fs_owner(struct mount *vfsp, struct ucred *cred);
+void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr);
+int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr);
+int secpolicy_fs_owner(struct mount *vfsp, cred_t *cr);
int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp);
void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp);
-int secpolicy_xvattr(struct vnode *vp, xvattr_t *xvap, uid_t owner,
- cred_t *cr, vtype_t vtype);
+int secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+ vtype_t vtype);
int secpolicy_smb(cred_t *cr);
#endif /* _KERNEL */
diff --git a/sys/cddl/compat/opensolaris/sys/proc.h b/sys/cddl/compat/opensolaris/sys/proc.h
index e0b7bc5758e3..9f26f61adf42 100644
--- a/sys/cddl/compat/opensolaris/sys/proc.h
+++ b/sys/cddl/compat/opensolaris/sys/proc.h
@@ -52,6 +52,8 @@
#define p0 proc0
+#define t_tid td_tid
+
typedef short pri_t;
typedef struct thread _kthread;
typedef struct thread kthread_t;
diff --git a/sys/cddl/compat/opensolaris/sys/rwlock.h b/sys/cddl/compat/opensolaris/sys/rwlock.h
index a3e55153bbcf..996a426357b9 100644
--- a/sys/cddl/compat/opensolaris/sys/rwlock.h
+++ b/sys/cddl/compat/opensolaris/sys/rwlock.h
@@ -37,7 +37,6 @@
#ifdef _KERNEL
typedef enum {
- RW_DRIVER = 2, /* driver (DDI) rwlock */
RW_DEFAULT = 4 /* kernel default rwlock */
} krw_type_t;
@@ -61,6 +60,7 @@ typedef struct sx krwlock_t;
#define rw_init(lock, desc, type, arg) do { \
const char *_name; \
+ ASSERT((type) == 0 || (type) == RW_DEFAULT); \
KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \
LO_EXPECTED, ("lock %s already initialized", #lock)); \
bzero((lock), sizeof(struct sx)); \
diff --git a/sys/cddl/compat/opensolaris/sys/sid.h b/sys/cddl/compat/opensolaris/sys/sid.h
index d48b1dffff70..d6c1b0cf1705 100644
--- a/sys/cddl/compat/opensolaris/sys/sid.h
+++ b/sys/cddl/compat/opensolaris/sys/sid.h
@@ -51,11 +51,28 @@ ksiddomain_rele(ksiddomain_t *kd)
kmem_free(kd, sizeof(*kd));
}
-static __inline int
-ksid_getid(void *ksid)
+static __inline uint_t
+ksid_getid(ksid_t *ks)
{
panic("%s has been unexpectedly called", __func__);
}
+static __inline const char *
+ksid_getdomain(ksid_t *ks)
+{
+
+ panic("%s has been unexpectedly called", __func__);
+}
+
+static __inline uint_t
+ksid_getrid(ksid_t *ks)
+{
+
+ panic("%s has been unexpectedly called", __func__);
+}
+
+#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1)
+#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1)
+
#endif /* _OPENSOLARIS_SYS_SID_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/stat.h b/sys/cddl/compat/opensolaris/sys/stat.h
index 5f45ebe08e90..d7301841d08b 100644
--- a/sys/cddl/compat/opensolaris/sys/stat.h
+++ b/sys/cddl/compat/opensolaris/sys/stat.h
@@ -33,6 +33,24 @@
#include_next <sys/stat.h>
#define stat64 stat
-#define fstat64 fstat
+#define MAXOFFSET_T OFF_MAX
+
+#ifndef _KERNEL
+#include <sys/disk.h>
+
+static __inline int
+fstat64(int fd, struct stat *sb)
+{
+ int ret;
+
+ ret = fstat(fd, sb);
+ if (ret == 0) {
+ if (S_ISCHR(sb->st_mode))
+ (void)ioctl(fd, DIOCGMEDIASIZE, &sb->st_size);
+ }
+ return (ret);
+}
#endif
+
+#endif /* !_COMPAT_OPENSOLARIS_SYS_STAT_H_ */
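fstat(2) on FreeBSD reports st_size == 0 for character devices, so the userland fstat64() shim above falls back to the GEOM DIOCGMEDIASIZE ioctl to learn the provider size. The same probe as a standalone tool (FreeBSD-only; the ioctl may fail on character devices that are not disks, which the shim deliberately ignores):

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        struct stat sb;
        int fd;

        if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                return (1);
        if (fstat(fd, &sb) == 0 && S_ISCHR(sb.st_mode)) {
                /* Character devices report st_size == 0; ask GEOM. */
                (void) ioctl(fd, DIOCGMEDIASIZE, &sb.st_size);
        }
        printf("%jd bytes\n", (intmax_t)sb.st_size);
        close(fd);
        return (0);
}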
diff --git a/sys/cddl/compat/opensolaris/sys/string.h b/sys/cddl/compat/opensolaris/sys/string.h
index aeec929610ed..0d7fb4fa4502 100644
--- a/sys/cddl/compat/opensolaris/sys/string.h
+++ b/sys/cddl/compat/opensolaris/sys/string.h
@@ -32,6 +32,8 @@
#include <sys/libkern.h>
char *strpbrk(const char *, const char *);
-void strident_canon(char *s, size_t n);
+void strident_canon(char *, size_t);
+char *kmem_asprintf(const char *, ...);
+void strfree(char *);
#endif /* _OPENSOLARIS_SYS_STRING_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/sunddi.h b/sys/cddl/compat/opensolaris/sys/sunddi.h
index 1ca2bf09fd6d..199701701e1e 100644
--- a/sys/cddl/compat/opensolaris/sys/sunddi.h
+++ b/sys/cddl/compat/opensolaris/sys/sunddi.h
@@ -29,10 +29,39 @@
#ifndef _OPENSOLARIS_SYS_SUNDDI_H_
#define _OPENSOLARIS_SYS_SUNDDI_H_
+#ifdef _KERNEL
+
+#include <sys/kmem.h>
+#include <sys/libkern.h>
+#include <sys/sysevent.h>
+
+#define strdup(ptr) strdup((ptr), M_SOLARIS)
#define ddi_driver_major(zfs_dip) (0)
#define ddi_copyin(from, to, size, flag) (bcopy((from), (to), (size)), 0)
#define ddi_copyout(from, to, size, flag) (bcopy((from), (to), (size)), 0)
int ddi_strtol(const char *str, char **nptr, int base, long *result);
int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result);
+int ddi_strtoull(const char *str, char **nptr, int base,
+ unsigned long long *result);
+
+#define DDI_SUCCESS (0)
+#define DDI_FAILURE (-1)
+#define DDI_SLEEP 0x666
+
+int ddi_soft_state_init(void **statep, size_t size, size_t nitems);
+void ddi_soft_state_fini(void **statep);
+
+void *ddi_get_soft_state(void *state, int item);
+int ddi_soft_state_zalloc(void *state, int item);
+void ddi_soft_state_free(void *state, int item);
+
+int _ddi_log_sysevent(char *vendor, char *class_name, char *subclass_name,
+ nvlist_t *attr_list, sysevent_id_t *eidp, int flag);
+#define ddi_log_sysevent(dip, vendor, class_name, subclass_name, \
+ attr_list, eidp, flag) \
+ _ddi_log_sysevent((vendor), (class_name), (subclass_name), \
+ (attr_list), (eidp), (flag))
+
+#endif /* _KERNEL */
#endif /* _OPENSOLARIS_SYS_SUNDDI_H_ */
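The ddi_soft_state_*() calls give the port per-item instance storage looked up by integer id (zvol uses the minor number). A toy userland model of the API shape, with fixed capacity and no locking; the demo_* names are illustrative, and the real implementation lives in the compat kernel code:

#include <stdlib.h>
#include <stdio.h>

#define DEMO_DDI_SUCCESS        0
#define DEMO_DDI_FAILURE        (-1)

/* Toy soft-state registry: an array of per-item allocations. */
struct demo_soft_state {
        size_t  size;
        void    *item[16];
};

static int
demo_soft_state_init(void **statep, size_t size)
{
        struct demo_soft_state *ss = calloc(1, sizeof (*ss));

        if (ss == NULL)
                return (DEMO_DDI_FAILURE);
        ss->size = size;
        *statep = ss;
        return (DEMO_DDI_SUCCESS);
}

static int
demo_soft_state_zalloc(void *state, int item)
{
        struct demo_soft_state *ss = state;

        return ((ss->item[item] = calloc(1, ss->size)) == NULL ?
            DEMO_DDI_FAILURE : DEMO_DDI_SUCCESS);
}

static void *
demo_get_soft_state(void *state, int item)
{
        return (((struct demo_soft_state *)state)->item[item]);
}

int
main(void)
{
        void *state;
        int *sc;

        (void) demo_soft_state_init(&state, sizeof (int));
        (void) demo_soft_state_zalloc(state, 3);
        sc = demo_get_soft_state(state, 3);
        *sc = 42;
        printf("%d\n", *(int *)demo_get_soft_state(state, 3));
        return (0);
}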
diff --git a/sys/cddl/compat/opensolaris/sys/sysmacros.h b/sys/cddl/compat/opensolaris/sys/sysmacros.h
deleted file mode 100644
index 0afc9ca90100..000000000000
--- a/sys/cddl/compat/opensolaris/sys/sysmacros.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD$
- */
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _OPENSOLARIS_SYS_SYSMACROS_H_
-#define _OPENSOLARIS_SYS_SYSMACROS_H_
-
-#include <sys/param.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef ABS
-#define ABS(a) ((a) < 0 ? -(a) : (a))
-#endif
-
-#ifndef SIGNOF
-#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
-#endif
-
-/*
- * Macro for checking power of 2 address alignment.
- */
-#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
-
-/*
- * Macro to determine if value is a power of 2
- */
-#define ISP2(x) (((x) & ((x) - 1)) == 0)
-
-/*
- * Macros for various sorts of alignment and rounding when the alignment
- * is known to be a power of 2.
- */
-#define P2ALIGN(x, align) ((x) & -(align))
-#define P2PHASE(x, align) ((x) & ((align) - 1))
-#define P2NPHASE(x, align) (-(x) & ((align) - 1))
-#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
-#define P2END(x, align) (-(~(x) & -(align)))
-#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
-#define P2BOUNDARY(off, len, align) (((off) ^ ((off) + (len) - 1)) > (align) - 1)
-/*
- * Determine whether two numbers have the same high-order bit.
- */
-#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
-
-/*
- * Typed version of the P2* macros. These macros should be used to ensure
- * that the result is correctly calculated based on the data type of (x),
- * which is passed in as the last argument, regardless of the data
- * type of the alignment. For example, if (x) is of type uint64_t,
- * and we want to round it up to a page boundary using "PAGESIZE" as
- * the alignment, we can do either
- * P2ROUNDUP(x, (uint64_t)PAGESIZE)
- * or
- * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
- */
-#define P2ALIGN_TYPED(x, align, type) \
- ((type)(x) & -(type)(align))
-#define P2PHASE_TYPED(x, align, type) \
- ((type)(x) & ((type)(align) - 1))
-#define P2NPHASE_TYPED(x, align, type) \
- (-(type)(x) & ((type)(align) - 1))
-#define P2ROUNDUP_TYPED(x, align, type) \
- (-(-(type)(x) & -(type)(align)))
-#define P2END_TYPED(x, align, type) \
- (-(~(type)(x) & -(type)(align)))
-#define P2PHASEUP_TYPED(x, align, phase, type) \
- ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
-#define P2CROSS_TYPED(x, y, align, type) \
- (((type)(x) ^ (type)(y)) > (type)(align) - 1)
-#define P2SAMEHIGHBIT_TYPED(x, y, type) \
- (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
-
-/*
- * Find highest one bit set.
- * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
- * High order bit is 31 (or 63 in _LP64 kernel).
- */
-static __inline int
-highbit(ulong_t i)
-{
- register int h = 1;
-
- if (i == 0)
- return (0);
-#ifdef _LP64
- if (i & 0xffffffff00000000ul) {
- h += 32; i >>= 32;
- }
-#endif
- if (i & 0xffff0000) {
- h += 16; i >>= 16;
- }
- if (i & 0xff00) {
- h += 8; i >>= 8;
- }
- if (i & 0xf0) {
- h += 4; i >>= 4;
- }
- if (i & 0xc) {
- h += 2; i >>= 2;
- }
- if (i & 0x2) {
- h += 1;
- }
- return (h);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _OPENSOLARIS_SYS_SYSMACROS_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/systeminfo.h b/sys/cddl/compat/opensolaris/sys/systeminfo.h
new file mode 100644
index 000000000000..df08f452bfec
--- /dev/null
+++ b/sys/cddl/compat/opensolaris/sys/systeminfo.h
@@ -0,0 +1,6 @@
+#ifndef _SYS_SYSTEMINFO_H_
+#define _SYS_SYSTEMINFO_H_
+
+#define HW_HOSTID_LEN 11
+
+#endif /* !_SYS_SYSTEMINFO_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/systm.h b/sys/cddl/compat/opensolaris/sys/systm.h
index d4ef17c750fe..136960ec7a91 100644
--- a/sys/cddl/compat/opensolaris/sys/systm.h
+++ b/sys/cddl/compat/opensolaris/sys/systm.h
@@ -29,10 +29,11 @@
#ifndef _OPENSOLARIS_SYS_SYSTM_H_
#define _OPENSOLARIS_SYS_SYSTM_H_
+#ifdef _KERNEL
+
#include <sys/param.h>
#include_next <sys/systm.h>
-#ifdef _KERNEL
#include <sys/string.h>
#define PAGESIZE PAGE_SIZE
diff --git a/sys/cddl/compat/opensolaris/sys/taskq.h b/sys/cddl/compat/opensolaris/sys/taskq.h
index 9083e6a35948..eedc4da3ff43 100644
--- a/sys/cddl/compat/opensolaris/sys/taskq.h
+++ b/sys/cddl/compat/opensolaris/sys/taskq.h
@@ -39,6 +39,6 @@ struct ostask {
};
taskqid_t taskq_dispatch_safe(taskq_t *tq, task_func_t func, void *arg,
- struct ostask *task);
+ u_int flags, struct ostask *task);
#endif /* _OPENSOLARIS_SYS_TASKQ_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/time.h b/sys/cddl/compat/opensolaris/sys/time.h
index 4275790bebe5..05db50e57ef5 100644
--- a/sys/cddl/compat/opensolaris/sys/time.h
+++ b/sys/cddl/compat/opensolaris/sys/time.h
@@ -38,8 +38,6 @@
typedef longlong_t hrtime_t;
-#define LBOLT ((gethrtime() * hz) / NANOSEC)
-
#if defined(__i386__) || defined(__powerpc__)
#define TIMESPEC_OVERFLOW(ts) \
((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX)
@@ -49,25 +47,23 @@ typedef longlong_t hrtime_t;
#endif
#ifdef _KERNEL
-#define lbolt64 (int64_t)(LBOLT)
-
static __inline hrtime_t
gethrtime(void) {
struct timespec ts;
hrtime_t nsec;
-#if 1
getnanouptime(&ts);
-#else
- nanouptime(&ts);
-#endif
nsec = (hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec;
return (nsec);
}
#define gethrestime_sec() (time_second)
#define gethrestime(ts) getnanotime(ts)
+#define gethrtime_waitfree() gethrtime()
+
+#define ddi_get_lbolt() ((gethrtime() * hz) / NANOSEC)
+#define ddi_get_lbolt64() (int64_t)((gethrtime() * hz) / NANOSEC)
#else
@@ -77,7 +73,6 @@ static __inline hrtime_t gethrtime(void) {
return (((u_int64_t) ts.tv_sec) * NANOSEC + ts.tv_nsec);
}
-
#endif /* _KERNEL */
#endif /* !_OPENSOLARIS_SYS_TIME_H_ */
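LBOLT and lbolt64 give way to ddi_get_lbolt()/ddi_get_lbolt64(), both computed from gethrtime(), which itself now always uses the cheaper getnanouptime(). A userland sketch of the tick arithmetic, assuming hz = 1000 (in the kernel hz is a tunable):

#include <time.h>
#include <stdint.h>
#include <stdio.h>

#define NANOSEC         1000000000LL
#define DEMO_HZ         1000    /* assumption; kernel hz is a tunable */

typedef long long hrtime_t;

static hrtime_t
demo_gethrtime(void)    /* monotonic time in ns, like getnanouptime() */
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec);
}

int
main(void)
{
        /* ddi_get_lbolt(): uptime expressed in clock ticks. */
        int64_t lbolt = (demo_gethrtime() * DEMO_HZ) / NANOSEC;

        printf("uptime ticks: %lld\n", (long long)lbolt);
        return (0);
}

Note that multiplying nanoseconds by hz before dividing can overflow a signed 64-bit value after roughly 2^63 / (hz * NANOSEC) seconds of uptime, about 107 days at hz = 1000.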
diff --git a/sys/cddl/compat/opensolaris/sys/types.h b/sys/cddl/compat/opensolaris/sys/types.h
index aeec27a0255d..4d79b5e24909 100644
--- a/sys/cddl/compat/opensolaris/sys/types.h
+++ b/sys/cddl/compat/opensolaris/sys/types.h
@@ -45,6 +45,7 @@ typedef int64_t clock_t;
#define MAXNAMELEN 256
typedef struct timespec timestruc_t;
+typedef struct timespec timespec_t;
typedef u_int uint_t;
typedef u_char uchar_t;
typedef u_short ushort_t;
@@ -59,6 +60,7 @@ typedef id_t zoneid_t;
typedef id_t ctid_t;
typedef mode_t o_mode_t;
typedef uint64_t pgcnt_t;
+typedef u_int minor_t;
#ifdef _KERNEL
@@ -83,7 +85,6 @@ typedef enum { B_FALSE, B_TRUE } boolean_t;
typedef longlong_t offset_t;
typedef u_longlong_t u_offset_t;
typedef uint64_t upad64_t;
-typedef struct timespec timespec_t;
typedef short pri_t;
typedef int32_t daddr32_t;
typedef int32_t time32_t;
diff --git a/sys/cddl/compat/opensolaris/sys/uio.h b/sys/cddl/compat/opensolaris/sys/uio.h
index c3fa0bcbf015..f0edfb1c0541 100644
--- a/sys/cddl/compat/opensolaris/sys/uio.h
+++ b/sys/cddl/compat/opensolaris/sys/uio.h
@@ -46,10 +46,31 @@ struct uio {
};
#endif
+#define uio_loffset uio_offset
+
typedef struct uio uio_t;
typedef struct iovec iovec_t;
-#define uio_loffset uio_offset
+typedef enum xuio_type {
+ UIOTYPE_ASYNCIO,
+ UIOTYPE_ZEROCOPY
+} xuio_type_t;
+
+typedef struct xuio {
+ uio_t xu_uio;
+
+ /* Extended uio fields */
+ enum xuio_type xu_type; /* What kind of uio structure? */
+ union {
+ struct {
+ int xu_zc_rw;
+ void *xu_zc_priv;
+ } xu_zc;
+ } xu_ext;
+} xuio_t;
+
+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
#ifdef BUILDING_ZFS
static __inline int
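xuio_t extends a plain uio by embedding it as the first member and tagging the transfer type, so a uio_t * that is known to be extended can be cast back to xuio_t *; XUIO_XUZC_PRIV/XUIO_XUZC_RW then reach into the zero-copy arm of the union. A sketch of the containment trick with stub types (the demo_* names are illustrative):

#include <stdio.h>

struct demo_uio {                       /* stub for struct uio */
        int     uio_resid;
};

typedef enum { DEMO_ASYNCIO, DEMO_ZEROCOPY } demo_xuio_type_t;

typedef struct demo_xuio {
        struct demo_uio xu_uio;         /* must stay the first member */
        demo_xuio_type_t xu_type;
        void            *xu_zc_priv;
} demo_xuio_t;

int
main(void)
{
        demo_xuio_t x = { { 512 }, DEMO_ZEROCOPY, NULL };
        struct demo_uio *uio = &x.xu_uio;       /* passed around as a uio */

        /* A receiver that knows the uio is extended upcasts it back. */
        demo_xuio_t *xp = (demo_xuio_t *)uio;
        printf("type=%d resid=%d\n", xp->xu_type, xp->xu_uio.uio_resid);
        return (0);
}

In the real code the consumer checks a flag on the uio before upcasting; the struct layout is what makes the round trip legal.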
diff --git a/sys/cddl/compat/opensolaris/sys/vfs.h b/sys/cddl/compat/opensolaris/sys/vfs.h
index d3a7e38da188..e1e49ed13cf1 100644
--- a/sys/cddl/compat/opensolaris/sys/vfs.h
+++ b/sys/cddl/compat/opensolaris/sys/vfs.h
@@ -121,9 +121,15 @@ typedef uint64_t vfs_feature_t;
#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */
#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */
#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */
-
-#define vfs_set_feature(vfsp, feature) do { } while (0)
-#define vfs_has_feature(vfsp, feature) (0)
+#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */
+#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */
+#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */
+#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200
+ /* Supports loaning/returning cache buffer */
+
+#define vfs_set_feature(vfsp, feature) do { } while (0)
+#define vfs_clear_feature(vfsp, feature) do { } while (0)
+#define vfs_has_feature(vfsp, feature) (0)
#endif /* _KERNEL */
diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h
index 926d0349bddf..d653db2f75ba 100644
--- a/sys/cddl/compat/opensolaris/sys/vnode.h
+++ b/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -29,6 +29,8 @@
#ifndef _OPENSOLARIS_SYS_VNODE_H_
#define _OPENSOLARIS_SYS_VNODE_H_
+#ifdef _KERNEL
+
struct vnode;
struct vattr;
@@ -60,6 +62,8 @@ typedef struct vop_vector vnodeops_t;
#define V_APPEND VAPPEND
+#define rootvfs (rootvnode == NULL ? NULL : rootvnode->v_mount)
+
static __inline int
vn_is_readonly(vnode_t *vp)
{
@@ -70,8 +74,9 @@ vn_is_readonly(vnode_t *vp)
#define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
#define vn_mountedvfs(vp) ((vp)->v_mountedhere)
#define vn_has_cached_data(vp) \
- ((vp)->v_object != NULL && ((vp)->v_object->resident_page_count > 0 \
- || (vp)->v_object->cache != NULL))
+ ((vp)->v_object != NULL && \
+ ((vp)->v_object->resident_page_count > 0 || \
+ (vp)->v_object->cache != NULL))
#define vn_exists(vp) do { } while (0)
#define vn_invalid(vp) do { } while (0)
#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)
@@ -93,7 +98,8 @@ vn_is_readonly(vnode_t *vp)
#define vnevent_rename_dest_dir(vp, ct) do { } while (0)
#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp))
-#define MANDMODE(mode) (0)
+#define MANDMODE(mode) (0)
+#define MANDLOCK(vp, mode) (0)
#define chklock(vp, op, offset, size, mode, ct) (0)
#define cleanlocks(vp, pid, foo) do { } while (0)
#define cleanshares(vp, pid) do { } while (0)
@@ -143,6 +149,7 @@ vattr_init_mask(vattr_t *vap)
#define FCREAT O_CREAT
#define FTRUNC O_TRUNC
+#define FEXCL O_EXCL
#define FDSYNC FFSYNC
#define FRSYNC FFSYNC
#define FSYNC FFSYNC
@@ -165,7 +172,8 @@ vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode,
ASSERT(crwhy == CRCREAT);
operation = CREATE;
} else {
- ASSERT(filemode == (FREAD | FWRITE | FOFFMAX));
+ ASSERT(filemode == (FREAD | FOFFMAX) ||
+ filemode == (FREAD | FWRITE | FOFFMAX));
ASSERT(crwhy == 0);
operation = LOOKUP;
}
@@ -292,4 +300,6 @@ vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
return (kern_unlink(curthread, fnamep, seg));
}
+#endif /* _KERNEL */
+
#endif /* _OPENSOLARIS_SYS_VNODE_H_ */
diff --git a/sys/cddl/compat/opensolaris/sys/zone.h b/sys/cddl/compat/opensolaris/sys/zone.h
index d761310a1a81..e8eb2c69acfb 100644
--- a/sys/cddl/compat/opensolaris/sys/zone.h
+++ b/sys/cddl/compat/opensolaris/sys/zone.h
@@ -57,6 +57,13 @@ extern int zone_dataset_detach(struct ucred *, const char *, int);
*/
extern int zone_dataset_visible(const char *, int *);
+/*
+ * Safely get the hostid of the specified zone (defaults to machine's hostid
+ * if the specified zone doesn't emulate a hostid). Passing NULL retrieves
+ * the global zone's (i.e., physical system's) hostid.
+ */
+extern uint32_t zone_get_hostid(void *);
+
#else /* !_KERNEL */
#define GLOBAL_ZONEID 0
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
index 5e8de19bae7a..47e0ffdd4a0a 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/avl.h>
@@ -152,166 +149,6 @@ typedef struct ace_list {
int seen; /* bitmask of all aclent_t a_type values seen */
} ace_list_t;
-ace_t trivial_acl[] = {
- {(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
- {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
- ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
- {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
- ACE_ACCESS_DENIED_ACE_TYPE},
- {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
- ACE_ACCESS_ALLOWED_ACE_TYPE},
- {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
- ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
- {(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
- ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
-};
-
-
-void
-adjust_ace_pair_common(void *pair, size_t access_off,
- size_t pairsize, mode_t mode)
-{
- char *datap = (char *)pair;
- uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
- uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
- access_off);
- if (mode & S_IROTH)
- *amask1 |= ACE_READ_DATA;
- else
- *amask0 |= ACE_READ_DATA;
- if (mode & S_IWOTH)
- *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
- else
- *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
- if (mode & S_IXOTH)
- *amask1 |= ACE_EXECUTE;
- else
- *amask0 |= ACE_EXECUTE;
-}
-
-void
-adjust_ace_pair(ace_t *pair, mode_t mode)
-{
- adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask),
- sizeof (ace_t), mode);
-}
-
-static void
-ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny)
-{
- if (type == ACE_ACCESS_ALLOWED_ACE_TYPE)
- *allow = B_TRUE;
- else if (type == ACE_ACCESS_DENIED_ACE_TYPE)
- *deny = B_TRUE;
-}
-
-/*
- * ace_trivial:
- * determine whether an ace_t acl is trivial
- *
- * Trivialness implies that the acl is composed of only
- * owner, group, everyone entries. ACL can't
- * have read_acl denied, and write_owner/write_acl/write_attributes
- * can only be owner@ entry.
- */
-int
-ace_trivial_common(void *acep, int aclcnt,
- uint64_t (*walk)(void *, uint64_t, int aclcnt,
- uint16_t *, uint16_t *, uint32_t *))
-{
- boolean_t owner_allow = B_FALSE;
- boolean_t group_allow = B_FALSE;
- boolean_t everyone_allow = B_FALSE;
- boolean_t owner_deny = B_FALSE;
- boolean_t group_deny = B_FALSE;
- boolean_t everyone_deny = B_FALSE;
- uint16_t flags;
- uint32_t mask;
- uint16_t type;
- uint64_t cookie = 0;
-
- while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
- switch (flags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- if (group_allow || group_deny || everyone_allow ||
- everyone_deny)
- return (1);
- ace_allow_deny_helper(type, &owner_allow, &owner_deny);
- break;
- case ACE_GROUP|ACE_IDENTIFIER_GROUP:
- if (everyone_allow || everyone_deny &&
- (!owner_allow && !owner_deny))
- return (1);
- ace_allow_deny_helper(type, &group_allow, &group_deny);
- break;
-
- case ACE_EVERYONE:
- if (!owner_allow && !owner_deny &&
- !group_allow && !group_deny)
- return (1);
- ace_allow_deny_helper(type,
- &everyone_allow, &everyone_deny);
- break;
- default:
- return (1);
-
- }
-
- if (flags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE))
- return (1);
-
- /*
- * Special check for some special bits
- *
- * Don't allow anybody to deny reading basic
- * attributes or a files ACL.
- */
- if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- (type == ACE_ACCESS_DENIED_ACE_TYPE))
- return (1);
-
- /*
- * Allow on owner@ to allow
- * write_acl/write_owner/write_attributes
- */
- if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
- (!(flags & ACE_OWNER) && (mask &
- (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES))))
- return (1);
-
- }
-
- if (!owner_allow || !owner_deny || !group_allow || !group_deny ||
- !everyone_allow || !everyone_deny)
- return (1);
-
- return (0);
-}
-
-uint64_t
-ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
- uint16_t *type, uint32_t *mask)
-{
- ace_t *acep = datap;
-
- if (cookie >= aclcnt)
- return (0);
-
- *flags = acep[cookie].a_flags;
- *type = acep[cookie].a_type;
- *mask = acep[cookie++].a_access_mask;
-
- return (cookie);
-}
-
-int
-ace_trivial(ace_t *acep, int aclcnt)
-{
- return (ace_trivial_common(acep, aclcnt, ace_walk));
-}
-
/*
* Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
* v = Ptr to array/vector of objs
@@ -1726,4 +1563,198 @@ out:
return (error);
#endif
}
-#endif /* _KERNEL */
+#endif /* !_KERNEL */
+
+#define SET_ACE(acl, index, who, mask, type, flags) { \
+ acl[0][index].a_who = (uint32_t)who; \
+ acl[0][index].a_type = type; \
+ acl[0][index].a_flags = flags; \
+ acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1,
+ uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone)
+{
+ *deny1 = *deny2 = *allow0 = *group = 0;
+
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ *deny1 |= ACE_READ_DATA;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ *deny1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ *deny1 |= ACE_EXECUTE;
+
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ *deny2 = ACE_READ_DATA;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ *deny2 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ *deny2 |= ACE_EXECUTE;
+
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ *allow0 |= ACE_READ_DATA;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ *allow0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ *allow0 |= ACE_EXECUTE;
+
+ *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ *owner |= ACE_READ_DATA;
+ if (mode & S_IWUSR)
+ *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXUSR)
+ *owner |= ACE_EXECUTE;
+
+ *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ *group |= ACE_READ_DATA;
+ if (mode & S_IWGRP)
+ *group |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXGRP)
+ *group |= ACE_EXECUTE;
+
+ *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ *everyone |= ACE_READ_DATA;
+ if (mode & S_IWOTH)
+ *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXOTH)
+ *everyone |= ACE_EXECUTE;
+}
+
+int
+acl_trivial_create(mode_t mode, ace_t **acl, int *count)
+{
+ uint32_t deny1, deny2;
+ uint32_t allow0;
+ uint32_t owner, group, everyone;
+ int index = 0;
+ int error;
+
+ *count = 3;
+ acl_trivial_access_masks(mode, &allow0, &deny1, &deny2, &owner, &group,
+ &everyone);
+
+ if (allow0)
+ (*count)++;
+ if (deny1)
+ (*count)++;
+ if (deny2)
+ (*count)++;
+
+ if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+ return (error);
+
+ if (allow0) {
+ SET_ACE(acl, index, -1, allow0, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_OWNER);
+ }
+ if (deny1) {
+ SET_ACE(acl, index, -1, deny1, ACE_ACCESS_DENIED_ACE_TYPE,
+ ACE_OWNER);
+ }
+ if (deny2) {
+ SET_ACE(acl, index, -1, deny2, ACE_ACCESS_DENIED_ACE_TYPE,
+ ACE_GROUP|ACE_IDENTIFIER_GROUP);
+ }
+
+ SET_ACE(acl, index, -1, owner, ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+ SET_ACE(acl, index, -1, group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ SET_ACE(acl, index, -1, everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_EVERYONE);
+
+ return (0);
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Triviality implies that the ACL is composed of only
+ * owner, group, and everyone entries.  The ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * may appear only on the owner@ entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+ * Don't allow anybody to deny reading basic
+ * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+ * only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr/
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+ return (0);
+}
+
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+ uint16_t *type, uint32_t *mask)
+{
+ ace_t *acep = datap;
+
+ if (cookie >= aclcnt)
+ return (0);
+
+ *flags = acep[cookie].a_flags;
+ *type = acep[cookie].a_type;
+ *mask = acep[cookie++].a_access_mask;
+
+ return (cookie);
+}
+
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
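acl_trivial_create() is the constructive inverse of ace_trivial(): it expands a POSIX mode into the canonical 3-to-6-entry NFSv4 ACL, adding a deny entry only when a less specific class holds a permission bit that the owner (deny1) or group (deny2) lacks. A worked check of the deny masks for mode 0645, using illustrative D_* bit values rather than the real ACE_* constants:

#include <stdio.h>
#include <sys/stat.h>
#include <stdint.h>

/* Demo ACE permission bits (values illustrative, not sys/acl.h's). */
#define D_READ  0x01
#define D_WRITE 0x02
#define D_EXEC  0x04

int
main(void)
{
        mode_t mode = 0645;             /* rw-r--r-x */
        uint32_t deny1 = 0, deny2 = 0;

        /* owner-deny: a lower class holds a bit the owner lacks */
        if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
                deny1 |= D_READ;
        if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
                deny1 |= D_WRITE;
        if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
                deny1 |= D_EXEC;

        /* group-deny: everyone@ holds a bit the group lacks */
        if (!(mode & S_IRGRP) && (mode & S_IROTH))
                deny2 |= D_READ;
        if (!(mode & S_IWGRP) && (mode & S_IWOTH))
                deny2 |= D_WRITE;
        if (!(mode & S_IXGRP) && (mode & S_IXOTH))
                deny2 |= D_EXEC;

        /* 0645: other has x that owner and group lack -> both denies */
        printf("deny1=%x deny2=%x entries=%d\n", deny1, deny2,
            3 + (deny1 != 0) + (deny2 != 0));   /* deny1=4 deny2=4 entries=5 */
        return (0);
}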
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
index 85ba0bdd5e9a..20be9a08d637 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
@@ -19,16 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ACL_COMMON_H
#define _ACL_COMMON_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
#include <sys/types.h>
#include <sys/acl.h>
#include <sys/stat.h>
@@ -51,10 +47,12 @@ extern acl_t *acl_alloc(acl_type_t);
extern void acl_free(acl_t *aclp);
extern int acl_translate(acl_t *aclp, int target_flavor,
int isdir, uid_t owner, gid_t group);
+#endif /* !_KERNEL */
void ksort(caddr_t v, int n, int s, int (*f)());
int cmp2acls(void *a, void *b);
-
-#endif /* _KERNEL */
+int acl_trivial_create(mode_t mode, ace_t **acl, int *count);
+void acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1,
+ uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S
index 6851086c1f96..6d0a1f8fbee8 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
.file "atomic.s"
@@ -30,14 +29,10 @@
#include <sys/asm_linkage.h>
ENTRY(atomic_add_64_nv)
- movq (%rdi), %rax
-1:
- movq %rsi, %rcx
- addq %rax, %rcx
+ mov %rsi, %rax // %rax = delta addend
lock
- cmpxchgq %rcx, (%rdi)
- jne 1b
- movq %rcx, %rax
+ xaddq %rsi, (%rdi) // %rsi = old value, (%rdi) = sum
+ addq %rsi, %rax // new value = original value + delta
ret
SET_SIZE(atomic_add_64_nv)
@@ -53,6 +48,13 @@
ret
SET_SIZE(atomic_or_8_nv)
+ ENTRY(atomic_cas_32)
+ movl %esi, %eax
+ lock
+ cmpxchgl %edx, (%rdi)
+ ret
+ SET_SIZE(atomic_cas_32)
+
ENTRY(atomic_cas_64)
movq %rsi, %rax
lock
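The amd64 atomic_add_64_nv rewrite replaces the cmpxchg retry loop with a single lock xadd: xaddq atomically stores old+delta into the target and returns the old value, so the "new value" result is just old+delta with no looping. The equivalent in C, assuming the GCC/Clang __atomic builtins (which compile to the same lock xadd on amd64):

#include <stdint.h>
#include <stdio.h>

static uint64_t
demo_add_64_nv(volatile uint64_t *target, int64_t delta)
{
        /* __atomic_add_fetch() emits "lock xadd" plus an add on amd64. */
        return (__atomic_add_fetch(target, delta, __ATOMIC_SEQ_CST));
}

int
main(void)
{
        uint64_t v = 40;

        printf("%llu\n", (unsigned long long)demo_add_64_nv(&v, 2)); /* 42 */
        return (0);
}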
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
index 57f7d0a47b53..c9dbd7925ea2 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -102,14 +102,14 @@
ret
SET_SIZE(atomic_or_8_nv)
- ENTRY(atomic_cas_ptr)
+ ENTRY(atomic_cas_32)
movl 4(%esp), %edx
movl 8(%esp), %eax
movl 12(%esp), %ecx
lock
cmpxchgl %ecx, (%edx)
ret
- SET_SIZE(atomic_cas_ptr)
+ SET_SIZE(atomic_cas_32)
ENTRY(atomic_cas_64)
pushl %ebx
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S
index 1b7c580c3b2c..3324fe30e719 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S
@@ -31,6 +31,17 @@
.text
/*
+ * uint32_t atomic_cas_32(volatile uint32_t *p, uint32_t cmp, uint32_t v)
+ */
+ENTRY(atomic_cas_32, 3)
+ mov ar.ccv = r33
+ ;;
+ cmpxchg4.acq r8 = [r32], r34, ar.ccv
+ ;;
+ br.ret.sptk rp
+END(atomic_cas_32)
+
+/*
* uint64_t atomic_cas_64(volatile uint64_t *p, uint64_t cmp, uint64_t v)
*/
ENTRY(atomic_cas_64, 3)
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S
index 0004f44242c4..0ab5d19c5a68 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S
@@ -36,6 +36,19 @@ ENTRY(atomic_add_64_nv)
mr %r3,%r5
blr
+ENTRY(atomic_cas_32)
+ 1: lwarx %r6,0,%r3
+ cmplw %r6,%r4
+ bne 2f
+ stwcx. %r5,0,%r3
+ bne- 1b
+ b 3f
+
+ 2: stwcx. %r6,0,%r3 /* clear reservation */
+
+ 3: mr %r3,%r6
+ blr
+
ENTRY(atomic_cas_64)
1: ldarx %r6,0,%r3
cmpld %r6,%r4
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S
index 5651ae0af0d1..3a498fe8963e 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S
@@ -77,7 +77,6 @@ add_64:
ENTRY(atomic_or_8)
ALTENTRY(atomic_or_8_nv)
ALTENTRY(atomic_or_uchar)
- ALTENTRY(atomic_or_uchar_nv)
and %o0, 0x3, %o4 ! %o4 = byte offset, left-to-right
xor %o4, 0x3, %g1 ! %g1 = byte offset, right-to-left
sll %g1, 3, %g1 ! %g1 = bit offset, right-to-left
@@ -97,7 +96,6 @@ add_64:
and %o5, %o3, %o5
retl
srl %o5, %g1, %o0 ! %o0 = new value
- SET_SIZE(atomic_or_uchar_nv)
SET_SIZE(atomic_or_uchar)
SET_SIZE(atomic_or_8_nv)
SET_SIZE(atomic_or_8)
diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c
index 01aa3cb2fa9d..6106ef176685 100644
--- a/sys/cddl/contrib/opensolaris/common/avl/avl.c
+++ b/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -19,13 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* AVL - generic AVL tree implementation for kernel use
*
@@ -243,7 +240,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
* "void *" of the found tree node
*/
void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
{
avl_node_t *node;
avl_node_t *prev = NULL;
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
index eb824c7ef173..53ed5f28f19b 100644
--- a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
@@ -20,12 +20,9 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/debug.h>
#include <sys/nvpair.h>
#include <sys/nvpair_impl.h>
@@ -39,6 +36,7 @@
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#endif
#ifndef offsetof
@@ -254,6 +252,12 @@ nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
nvl->nvl_pad = 0;
}
+uint_t
+nvlist_nvflag(nvlist_t *nvl)
+{
+ return (nvl->nvl_nvflag);
+}
+
/*
* nvlist_alloc - Allocate nvlist.
*/
@@ -687,6 +691,18 @@ nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
return (ENOENT);
}
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (0);
+}
+
/*
* This function calculates the size of an nvpair value.
*
@@ -1157,6 +1173,42 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
return (curr != NULL ? &curr->nvi_nvp : NULL);
}
+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ if (nvp == NULL)
+ curr = priv->nvp_last;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_prev;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_TRUE);
+
+ return (priv->nvp_list == NULL);
+}
+
char *
nvpair_name(nvpair_t *nvp)
{
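The new nvlist helpers round out the iteration API: nvlist_nvflag() exposes the allocation flags, nvlist_empty() tests without iterating, nvlist_prev_nvpair() walks backwards (NULL starts from the last pair), and nvlist_remove_nvpair() unlinks by pair instead of by name and type. A userland sketch against libnvpair, assuming a library recent enough to ship these v28-era symbols (link with -lnvpair):

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *nvl;
        nvpair_t *nvp;

        if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
                return (1);
        (void) nvlist_add_uint64(nvl, "a", 1);
        (void) nvlist_add_uint64(nvl, "b", 2);

        /* Walk backwards: prints "b", then "a". */
        for (nvp = nvlist_prev_nvpair(nvl, NULL); nvp != NULL;
            nvp = nvlist_prev_nvpair(nvl, nvp))
                printf("%s\n", nvpair_name(nvp));

        /* Remove by pair, then confirm emptiness. */
        while ((nvp = nvlist_next_nvpair(nvl, NULL)) != NULL)
                (void) nvlist_remove_nvpair(nvl, nvp);
        printf("empty: %d\n", nvlist_empty(nvl));       /* 1 */

        nvlist_free(nvl);
        return (0);
}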
diff --git a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
index 73cf74a4d159..07186b30c830 100644
--- a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
+++ b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
@@ -42,6 +42,7 @@
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
+#include <sys/sunddi.h>
#else
#include <strings.h>
#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
index 74517a3f6920..5df687662111 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file is intended for functions that ought to be common between user
* land (libzfs) and the kernel. When many common routines need to be shared
@@ -33,11 +30,14 @@
#if defined(_KERNEL)
#include <sys/systm.h>
+#else
+#include <string.h>
#endif
#include <sys/types.h>
#include <sys/fs/zfs.h>
#include <sys/nvpair.h>
+#include "zfs_comutil.h"
/*
* Are there allocatable vdevs?
@@ -63,3 +63,139 @@ zfs_allocatable_devs(nvlist_t *nv)
}
return (B_FALSE);
}
+
+void
+zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp)
+{
+ nvlist_t *policy;
+ nvpair_t *elem;
+ char *nm;
+
+ /* Defaults */
+ zrpp->zrp_request = ZPOOL_NO_REWIND;
+ zrpp->zrp_maxmeta = 0;
+ zrpp->zrp_maxdata = UINT64_MAX;
+ zrpp->zrp_txg = UINT64_MAX;
+
+ if (nvl == NULL)
+ return;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ nm = nvpair_name(elem);
+ if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) {
+ if (nvpair_value_nvlist(elem, &policy) == 0)
+ zpool_get_rewind_policy(policy, zrpp);
+ return;
+ } else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) {
+ if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0)
+ if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES)
+ zrpp->zrp_request = ZPOOL_NO_REWIND;
+ } else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) {
+ (void) nvpair_value_uint64(elem, &zrpp->zrp_txg);
+ } else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta);
+ } else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata);
+ }
+ }
+ if (zrpp->zrp_request == 0)
+ zrpp->zrp_request = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+ int version_zpl;
+ int version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+ {ZPL_VERSION_SA, SPA_VERSION_SA},
+ {0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_spa; i++) {
+ if (spa_version >= zfs_version_table[i].version_spa)
+ version = zfs_version_table[i].version_zpl;
+ }
+
+ return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding zpl version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_zpl; i++) {
+ if (zfs_version_table[i].version_zpl >= zpl_version)
+ return (zfs_version_table[i].version_spa);
+ }
+
+ return (version);
+}
+
+const char *zfs_history_event_names[LOG_END] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+ "pool split",
+};
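zfs_zpl_version_map() scans the whole table and keeps the last match, yielding the highest ZPL version the given SPA version supports, while zfs_spa_version_map() returns on the first entry whose ZPL version satisfies the request, i.e. the minimum SPA version required. A self-contained check of both directions; the table values below are illustrative stand-ins recalled from the v28 sources, so verify against zfs.h before relying on them:

#include <stdio.h>

/* Demo copy of the mapping table; real values come from zfs.h. */
static const struct { int zpl, spa; } demo_tbl[] = {
        { 1, 1 },       /* ZPL_VERSION_INITIAL -> SPA_VERSION_INITIAL */
        { 2, 1 },       /* ZPL_VERSION_DIRENT_TYPE */
        { 3, 9 },       /* ZPL_VERSION_FUID -> SPA_VERSION_FUID */
        { 4, 15 },      /* ZPL_VERSION_USERSPACE */
        { 5, 24 },      /* ZPL_VERSION_SA -> SPA_VERSION_SA */
        { 0, 0 }
};

static int
demo_zpl_version_map(int spa_version)   /* max zpl for this spa */
{
        int i, version = -1;

        for (i = 0; demo_tbl[i].spa; i++)
                if (spa_version >= demo_tbl[i].spa)
                        version = demo_tbl[i].zpl;
        return (version);
}

static int
demo_spa_version_map(int zpl_version)   /* min spa for this zpl */
{
        int i;

        for (i = 0; demo_tbl[i].zpl; i++)
                if (demo_tbl[i].zpl >= zpl_version)
                        return (demo_tbl[i].spa);
        return (-1);
}

int
main(void)
{
        printf("spa 15 -> max zpl %d\n", demo_zpl_version_map(15)); /* 4 */
        printf("zpl 5 -> min spa %d\n", demo_spa_version_map(5));   /* 24 */
        return (0);
}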
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
index f517044a80a0..61327f9aa909 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZFS_COMUTIL_H
#define _ZFS_COMUTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/fs/zfs.h>
#include <sys/types.h>
@@ -35,7 +32,12 @@
extern "C" {
#endif
-extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+extern boolean_t zfs_allocatable_devs(nvlist_t *);
+extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
+
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+extern const char *zfs_history_event_names[LOG_END];
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
index 2964cae5db8e..18681035d6e1 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
@@ -19,8 +19,8 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
*/
#if defined(_KERNEL)
@@ -61,12 +61,15 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
{ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
{ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
- {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
+ {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
{ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
{ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
{ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
{ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
{ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+ {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+ {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+ {ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
{NULL, ZFS_DELEG_NOTE_NONE }
};
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
index cdbbd83de07e..9997dffae7d0 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
@@ -19,8 +19,8 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _ZFS_DELEG_H
@@ -52,6 +52,7 @@ typedef enum {
ZFS_DELEG_NOTE_CLONE,
ZFS_DELEG_NOTE_PROMOTE,
ZFS_DELEG_NOTE_RENAME,
+ ZFS_DELEG_NOTE_SEND,
ZFS_DELEG_NOTE_RECEIVE,
ZFS_DELEG_NOTE_ALLOW,
ZFS_DELEG_NOTE_USERPROP,
@@ -61,6 +62,9 @@ typedef enum {
ZFS_DELEG_NOTE_GROUPQUOTA,
ZFS_DELEG_NOTE_USERUSED,
ZFS_DELEG_NOTE_GROUPUSED,
+ ZFS_DELEG_NOTE_HOLD,
+ ZFS_DELEG_NOTE_RELEASE,
+ ZFS_DELEG_NOTE_DIFF,
ZFS_DELEG_NOTE_NONE
} zfs_deleg_note_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
index 54247d724d49..fa43ce6bdb5d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
@@ -128,6 +128,7 @@
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
+#include <sys/zio.h>
#include <sys/spa.h>
void
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
new file mode 100644
index 000000000000..b49df0cf4f0f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
@@ -0,0 +1,53 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_FLETCHER_H
+#define _ZFS_FLETCHER_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * fletcher checksum functions
+ */
+
+void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_incremental_native(const void *, uint64_t,
+ zio_cksum_t *);
+void fletcher_4_incremental_byteswap(const void *, uint64_t,
+ zio_cksum_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_FLETCHER_H */
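The header only declares the checksum entry points; the algorithm itself is four cascaded accumulators over the buffer viewed as 32-bit words. A minimal sketch of the fletcher_4_native() inner loop under that assumption (the demo types stand in for zio_cksum_t, and size must be a multiple of 4 bytes):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t zc_word[4]; } demo_cksum_t;

static void
demo_fletcher_4(const void *buf, uint64_t size, demo_cksum_t *zcp)
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a = 0, b = 0, c = 0, d = 0;

        /* Four cascaded sums: each accumulator feeds the next. */
        for (; ip < ipend; ip++) {
                a += *ip;
                b += a;
                c += b;
                d += c;
        }
        zcp->zc_word[0] = a;
        zcp->zc_word[1] = b;
        zcp->zc_word[2] = c;
        zcp->zc_word[3] = d;
}

int
main(void)
{
        uint32_t data[4] = { 1, 2, 3, 4 };
        demo_cksum_t zc;

        demo_fletcher_4(data, sizeof (data), &zc);
        printf("%llu %llu %llu %llu\n",
            (unsigned long long)zc.zc_word[0],
            (unsigned long long)zc.zc_word[1],
            (unsigned long long)zc.zc_word[2],
            (unsigned long long)zc.zc_word[3]);         /* 10 20 35 56 */
        return (0);
}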
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
new file mode 100644
index 000000000000..d830fd904a75
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
@@ -0,0 +1,349 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+#include "zfs_ioctl_compat.h"
+
+/*
+ * FreeBSD zfs_cmd compatibility with v15 and older binaries
+ * appropriately remap/extend the zfs_cmd_t structure
+ */
+void
+zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
+{
+ zfs_cmd_v15_t *zc_c;
+
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ zc_c = (void *)addr;
+
+ /* zc */
+ strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN);
+ strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN);
+ zc->zc_guid = zc_c->zc_guid;
+ zc->zc_nvlist_conf = zc_c->zc_nvlist_conf;
+ zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size;
+ zc->zc_nvlist_src = zc_c->zc_nvlist_src;
+ zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size;
+ zc->zc_nvlist_dst = zc_c->zc_nvlist_dst;
+ zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size;
+ zc->zc_cookie = zc_c->zc_cookie;
+ zc->zc_objset_type = zc_c->zc_objset_type;
+ zc->zc_perm_action = zc_c->zc_perm_action;
+ zc->zc_history = zc_c->zc_history;
+ zc->zc_history_len = zc_c->zc_history_len;
+ zc->zc_history_offset = zc_c->zc_history_offset;
+ zc->zc_obj = zc_c->zc_obj;
+ zc->zc_share = zc_c->zc_share;
+ zc->zc_jailid = zc_c->zc_jailid;
+ zc->zc_objset_stats = zc_c->zc_objset_stats;
+ zc->zc_begin_record = zc_c->zc_begin_record;
+
+ /* zc->zc_inject_record */
+ zc->zc_inject_record.zi_objset =
+ zc_c->zc_inject_record.zi_objset;
+ zc->zc_inject_record.zi_object =
+ zc_c->zc_inject_record.zi_object;
+ zc->zc_inject_record.zi_start =
+ zc_c->zc_inject_record.zi_start;
+ zc->zc_inject_record.zi_end =
+ zc_c->zc_inject_record.zi_end;
+ zc->zc_inject_record.zi_guid =
+ zc_c->zc_inject_record.zi_guid;
+ zc->zc_inject_record.zi_level =
+ zc_c->zc_inject_record.zi_level;
+ zc->zc_inject_record.zi_error =
+ zc_c->zc_inject_record.zi_error;
+ zc->zc_inject_record.zi_type =
+ zc_c->zc_inject_record.zi_type;
+ zc->zc_inject_record.zi_freq =
+ zc_c->zc_inject_record.zi_freq;
+ zc->zc_inject_record.zi_failfast =
+ zc_c->zc_inject_record.zi_failfast;
+ }
+}
+
+void
+zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int cflag)
+{
+ zfs_cmd_v15_t *zc_c;
+
+ switch (cflag) {
+ case ZFS_CMD_COMPAT_V15:
+ zc_c = (void *)addr;
+
+ /* zc */
+ strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN);
+ strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN);
+ zc_c->zc_guid = zc->zc_guid;
+ zc_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+ zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+ zc_c->zc_nvlist_src = zc->zc_nvlist_src;
+ zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+ zc_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+ zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+ zc_c->zc_cookie = zc->zc_cookie;
+ zc_c->zc_objset_type = zc->zc_objset_type;
+ zc_c->zc_perm_action = zc->zc_perm_action;
+ zc_c->zc_history = zc->zc_history;
+ zc_c->zc_history_len = zc->zc_history_len;
+ zc_c->zc_history_offset = zc->zc_history_offset;
+ zc_c->zc_obj = zc->zc_obj;
+ zc_c->zc_share = zc->zc_share;
+ zc_c->zc_jailid = zc->zc_jailid;
+ zc_c->zc_objset_stats = zc->zc_objset_stats;
+ zc_c->zc_begin_record = zc->zc_begin_record;
+
+ /* zc_inject_record */
+ zc_c->zc_inject_record.zi_objset =
+ zc->zc_inject_record.zi_objset;
+ zc_c->zc_inject_record.zi_object =
+ zc->zc_inject_record.zi_object;
+ zc_c->zc_inject_record.zi_start =
+ zc->zc_inject_record.zi_start;
+ zc_c->zc_inject_record.zi_end =
+ zc->zc_inject_record.zi_end;
+ zc_c->zc_inject_record.zi_guid =
+ zc->zc_inject_record.zi_guid;
+ zc_c->zc_inject_record.zi_level =
+ zc->zc_inject_record.zi_level;
+ zc_c->zc_inject_record.zi_error =
+ zc->zc_inject_record.zi_error;
+ zc_c->zc_inject_record.zi_type =
+ zc->zc_inject_record.zi_type;
+ zc_c->zc_inject_record.zi_freq =
+ zc->zc_inject_record.zi_freq;
+ zc_c->zc_inject_record.zi_failfast =
+ zc->zc_inject_record.zi_failfast;
+
+ break;
+ }
+}
+
+static int
+zfs_ioctl_compat_write_nvlist_dst(zfs_cmd_t *zc, nvlist_t *nvl, size_t nvsize)
+{
+ char *packed = (void *)(uintptr_t)zc->zc_nvlist_dst;
+ int err;
+
+ err = nvlist_pack(nvl, &packed, &nvsize,
+ NV_ENCODE_NATIVE, 0);
+ if (err == 0)
+ zc->zc_nvlist_dst_size = nvsize;
+
+ return (err);
+}
+
+static void
+zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl)
+{
+ nvlist_t **child;
+ nvlist_t *nvroot = NULL;
+ vdev_stat_t *vs;
+ uint_t c, children, nelem;
+
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ zfs_ioctl_compat_fix_stats_nvlist(child[c]);
+ }
+ }
+
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0)
+ zfs_ioctl_compat_fix_stats_nvlist(nvroot);
+#ifdef _KERNEL
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+ if (nvlist_lookup_uint64_array(nvl, "stats",
+#endif
+ (uint64_t **)&vs, &nelem) == 0) {
+ nvlist_add_uint64_array(nvl,
+#ifdef _KERNEL
+ "stats",
+#else
+ ZPOOL_CONFIG_VDEV_STATS,
+#endif
+ (uint64_t *)vs, nelem);
+#ifdef _KERNEL
+ nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+ nvlist_remove(nvl, "stats",
+#endif
+ DATA_TYPE_UINT64_ARRAY);
+ }
+}
+
+static void
+zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int vec)
+{
+ nvlist_t *nv, *nvp = NULL;
+ nvpair_t *elem;
+ size_t nvsize;
+ char *packed;
+
+ if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, &nv, 0) != 0)
+ return;
+
+ if (vec == 5) { /* ZFS_IOC_POOL_STATS */
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
+ if (nvpair_value_nvlist(elem, &nvp) == 0)
+ zfs_ioctl_compat_fix_stats_nvlist(nvp);
+ }
+ elem = NULL;
+ } else
+ zfs_ioctl_compat_fix_stats_nvlist(nv);
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_NATIVE) == 0);
+ zfs_ioctl_compat_write_nvlist_dst(zc, nv, nvsize);
+
+ nvlist_free(nv);
+}
+
+static void
+zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc)
+{
+ nvlist_t *nv, *nva = NULL;
+ size_t nvsize;
+
+ if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, &nv, 0) != 0)
+ return;
+
+#ifdef _KERNEL
+ if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) {
+ nvlist_add_nvlist(nv, "used", nva);
+ nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST);
+ }
+
+ if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) {
+ nvlist_add_nvlist(nv, "available", nva);
+ nvlist_remove(nv, "free", DATA_TYPE_NVLIST);
+ }
+#else
+ if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) {
+ nvlist_add_nvlist(nv, "allocated", nva);
+ nvlist_remove(nv, "used", DATA_TYPE_NVLIST);
+ }
+
+ if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) {
+ nvlist_add_nvlist(nv, "free", nva);
+ nvlist_remove(nv, "available", DATA_TYPE_NVLIST);
+ }
+#endif
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_NATIVE) == 0);
+ zfs_ioctl_compat_write_nvlist_dst(zc, nv, nvsize);
+
+ nvlist_free(nv);
+}
+
+#ifndef _KERNEL
+int
+zcmd_ioctl_compat(int fd, unsigned long cmd, zfs_cmd_t *zc, const int cflag)
+{
+ int nc, ret;
+ void *zc_c;
+ unsigned long ncmd;
+
+ if (cflag == ZFS_CMD_COMPAT_NONE) {
+ ret = ioctl(fd, cmd, zc);
+ return (ret);
+ }
+
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ nc = zfs_ioctl_v28_to_v15[ZFS_IOC(cmd)];
+ zc_c = malloc(sizeof(zfs_cmd_v15_t));
+ ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
+ } else
+ return (EINVAL);
+
+ if (ZFS_IOC(ncmd) == ZFS_IOC_COMPAT_FAIL) {
+ free(zc_c);
+ return (ENOTSUP);
+ }
+
+ zfs_cmd_compat_put(zc, (caddr_t)zc_c, cflag);
+ ret = ioctl(fd, ncmd, zc_c);
+ if (cflag == ZFS_CMD_COMPAT_V15 &&
+ nc == 2 /* ZFS_IOC_POOL_IMPORT */)
+ ret = ioctl(fd, _IOWR('Z', 4 /* ZFS_IOC_POOL_CONFIGS */,
+ struct zfs_cmd_v15), zc_c);
+ zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag);
+ free(zc_c);
+
+ switch (nc) {
+ case 2: /* ZFS_IOC_POOL_IMPORT */
+ case 4: /* ZFS_IOC_POOL_CONFIGS */
+ case 5: /* ZFS_IOC_POOL_STATS */
+ case 6: /* ZFS_IOC_POOL_TRYIMPORT */
+ zfs_ioctl_compat_fix_stats(zc, nc);
+ break;
+ case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+ zfs_ioctl_compat_pool_get_props(zc);
+ break;
+ }
+
+ return (ret);
+}
+#else /* _KERNEL */
+void
+zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag)
+{
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ switch (*vec) {
+ case 7: /* ZFS_IOC_POOL_SCRUB (v15) */
+ zc->zc_cookie = POOL_SCAN_SCRUB;
+ break;
+ }
+ }
+}
+
+void
+zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag)
+{
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ switch (vec) {
+ case 4: /* ZFS_IOC_POOL_CONFIGS */
+ case 5: /* ZFS_IOC_POOL_STATS */
+ case 6: /* ZFS_IOC_POOL_TRYIMPORT */
+ zfs_ioctl_compat_fix_stats(zc, vec);
+ break;
+ case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+ zfs_ioctl_compat_pool_get_props(zc);
+ break;
+ }
+ }
+}
+#endif /* _KERNEL */
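
The two fixup helpers above reduce to one nvlist idiom: re-homing a member under a
different key, then dropping the old pair. A minimal sketch of that idiom, assuming
only the libnvpair calls already used above (the helper name is illustrative, not
part of the patch):

    #include <libnvpair.h>

    /*
     * Move an nvlist-valued member from key "from" to key "to", the way
     * zfs_ioctl_compat_pool_get_props() maps used/allocated and
     * available/free.  nvlist_add_nvlist() stores a copy, so removing
     * the old pair afterwards is safe.
     */
    static void
    nvlist_rename_nvlist(nvlist_t *nvl, const char *from, const char *to)
    {
            nvlist_t *val;

            if (nvlist_lookup_nvlist(nvl, from, &val) == 0) {
                    (void) nvlist_add_nvlist(nvl, to, val);
                    (void) nvlist_remove(nvl, from, DATA_TYPE_NVLIST);
            }
    }
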
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
new file mode 100644
index 000000000000..03d648c9ea18
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_COMPAT_H
+#define _SYS_ZFS_IOCTL_COMPAT_H
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef _KERNEL
+#include <sys/nvpair.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CMD_COMPAT_NONE 0
+#define ZFS_CMD_COMPAT_V15 1
+
+#define ZFS_IOC_COMPAT_PASS 254
+#define ZFS_IOC_COMPAT_FAIL 255
+
+typedef struct zinject_record_v15 {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+} zinject_record_v15_t;
+
+typedef struct zfs_cmd_v15 {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_v15_t zc_inject_record;
+} zfs_cmd_v15_t;
+
+#ifdef _KERNEL
+static unsigned long zfs_ioctl_v15_to_v28[] = {
+ 0, /* 0 ZFS_IOC_POOL_CREATE */
+ 1, /* 1 ZFS_IOC_POOL_DESTROY */
+ 2, /* 2 ZFS_IOC_POOL_IMPORT */
+ 3, /* 3 ZFS_IOC_POOL_EXPORT */
+ 4, /* 4 ZFS_IOC_POOL_CONFIGS */
+ 5, /* 5 ZFS_IOC_POOL_STATS */
+ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
+ 7, /* 7 ZFS_IOC_POOL_SCRUB */
+ 8, /* 8 ZFS_IOC_POOL_FREEZE */
+ 9, /* 9 ZFS_IOC_POOL_UPGRADE */
+ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
+ 11, /* 11 ZFS_IOC_VDEV_ADD */
+ 12, /* 12 ZFS_IOC_VDEV_REMOVE */
+ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
+ 14, /* 14 ZFS_IOC_VDEV_ATTACH */
+ 15, /* 15 ZFS_IOC_VDEV_DETACH */
+ 16, /* 16 ZFS_IOC_VDEV_SETPATH */
+ 18, /* 17 ZFS_IOC_OBJSET_STATS */
+ 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */
+ 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */
+ 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+ 22, /* 21 ZFS_IOC_SET_PROP */
+ ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */
+ ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */
+ 23, /* 24 ZFS_IOC_CREATE */
+ 24, /* 25 ZFS_IOC_DESTROY */
+ 25, /* 26 ZFS_IOC_ROLLBACK */
+ 26, /* 27 ZFS_IOC_RENAME */
+ 27, /* 28 ZFS_IOC_RECV */
+ 28, /* 29 ZFS_IOC_SEND */
+ 29, /* 30 ZFS_IOC_INJECT_FAULT */
+ 30, /* 31 ZFS_IOC_CLEAR_FAULT */
+ 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */
+ 32, /* 33 ZFS_IOC_ERROR_LOG */
+ 33, /* 34 ZFS_IOC_CLEAR */
+ 34, /* 35 ZFS_IOC_PROMOTE */
+ 35, /* 36 ZFS_IOC_DESTROY_SNAPS */
+ 36, /* 37 ZFS_IOC_SNAPSHOT */
+ 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */
+ 38, /* 39 ZFS_IOC_OBJ_TO_PATH */
+ 39, /* 40 ZFS_IOC_POOL_SET_PROPS */
+ 40, /* 41 ZFS_IOC_POOL_GET_PROPS */
+ 41, /* 42 ZFS_IOC_SET_FSACL */
+ 42, /* 43 ZFS_IOC_GET_FSACL */
+ ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */
+ 43, /* 45 ZFS_IOC_SHARE */
+ 44, /* 46 ZFS_IOC_INHERIT_PROP */
+ 58, /* 47 ZFS_IOC_JAIL */
+ 59, /* 48 ZFS_IOC_UNJAIL */
+ 45, /* 49 ZFS_IOC_SMB_ACL */
+ 46, /* 50 ZFS_IOC_USERSPACE_ONE */
+ 47, /* 51 ZFS_IOC_USERSPACE_MANY */
+ 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */
+ 17, /* 53 ZFS_IOC_VDEV_SETFRU */
+};
+
+#else /* ! _KERNEL */
+static unsigned long zfs_ioctl_v28_to_v15[] = {
+ 0, /* 0 ZFS_IOC_POOL_CREATE */
+ 1, /* 1 ZFS_IOC_POOL_DESTROY */
+ 2, /* 2 ZFS_IOC_POOL_IMPORT */
+ 3, /* 3 ZFS_IOC_POOL_EXPORT */
+ 4, /* 4 ZFS_IOC_POOL_CONFIGS */
+ 5, /* 5 ZFS_IOC_POOL_STATS */
+ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
+ 7, /* 7 ZFS_IOC_POOL_SCAN */
+ 8, /* 8 ZFS_IOC_POOL_FREEZE */
+ 9, /* 9 ZFS_IOC_POOL_UPGRADE */
+ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
+ 11, /* 11 ZFS_IOC_VDEV_ADD */
+ 12, /* 12 ZFS_IOC_VDEV_REMOVE */
+ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
+ 14, /* 14 ZFS_IOC_VDEV_ATTACH */
+ 15, /* 15 ZFS_IOC_VDEV_DETACH */
+ 16, /* 16 ZFS_IOC_VDEV_SETPATH */
+ 53, /* 17 ZFS_IOC_VDEV_SETFRU */
+ 17, /* 18 ZFS_IOC_OBJSET_STATS */
+ 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */
+ 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */
+ 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+ 21, /* 22 ZFS_IOC_SET_PROP */
+ 24, /* 23 ZFS_IOC_CREATE */
+ 25, /* 24 ZFS_IOC_DESTROY */
+ 26, /* 25 ZFS_IOC_ROLLBACK */
+ 27, /* 26 ZFS_IOC_RENAME */
+ 28, /* 27 ZFS_IOC_RECV */
+ 29, /* 28 ZFS_IOC_SEND */
+ 30, /* 29 ZFS_IOC_INJECT_FAULT */
+ 31, /* 30 ZFS_IOC_CLEAR_FAULT */
+ 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */
+ 33, /* 32 ZFS_IOC_ERROR_LOG */
+ 34, /* 33 ZFS_IOC_CLEAR */
+ 35, /* 34 ZFS_IOC_PROMOTE */
+ 36, /* 35 ZFS_IOC_DESTROY_SNAPS */
+ 37, /* 36 ZFS_IOC_SNAPSHOT */
+ 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */
+ 39, /* 38 ZFS_IOC_OBJ_TO_PATH */
+ 40, /* 39 ZFS_IOC_POOL_SET_PROPS */
+ 41, /* 40 ZFS_IOC_POOL_GET_PROPS */
+ 42, /* 41 ZFS_IOC_SET_FSACL */
+ 43, /* 42 ZFS_IOC_GET_FSACL */
+ 45, /* 43 ZFS_IOC_SHARE */
+ 46, /* 44 ZFS_IOC_INHERIT_PROP */
+ 49, /* 45 ZFS_IOC_SMB_ACL */
+ 50, /* 46 ZFS_IOC_USERSPACE_ONE */
+ 51, /* 47 ZFS_IOC_USERSPACE_MANY */
+ 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */
+ ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */
+ ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */
+ ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */
+ ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */
+ ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */
+ ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */
+ ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */
+ ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */
+ ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */
+ 47, /* 58 ZFS_IOC_JAIL */
+ 48, /* 59 ZFS_IOC_UNJAIL */
+};
+#endif /* ! _KERNEL */
+
+#ifdef _KERNEL
+void zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int);
+void zfs_ioctl_compat_post(zfs_cmd_t *, int, const int);
+#else
+int zcmd_ioctl_compat(int, unsigned long, zfs_cmd_t *, const int);
+#endif /* _KERNEL */
+void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int);
+void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_COMPAT_H */
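
A sketch of how a userland consumer is expected to walk the v28-to-v15 table above;
ZFS_IOC() is the vector-extraction macro from zfs_ioctl.h and is assumed here, as is
inclusion of this header:

    /* Map a v28 ioctl vector to its v15 counterpart, or -1 if none exists. */
    static int
    v28_to_v15_vec(unsigned long cmd)
    {
            unsigned int vec = ZFS_IOC(cmd);

            if (vec >= sizeof (zfs_ioctl_v28_to_v15) /
                sizeof (zfs_ioctl_v28_to_v15[0]))
                    return (-1);
            if (zfs_ioctl_v28_to_v15[vec] == ZFS_IOC_COMPAT_FAIL)
                    return (-1);    /* e.g. ZFS_IOC_HOLD has no v15 peer */
            return ((int)zfs_ioctl_v28_to_v15[vec]);
    }

zcmd_ioctl_compat() in zfs_ioctl_compat.c performs essentially this lookup before
rewriting the command number with _IOWR().
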
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
index 45730c6fc4bd..5cfafea471b3 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -59,7 +59,7 @@ valid_char(char c)
* Snapshot names must be made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.: ]
*/
int
snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
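
The comment change above widens the documented snapshot character class to accept a
space; the matching code change lives outside this hunk. An illustrative predicate
equivalent to the documented class (the real valid_char() spells out the ranges
instead of using ctype):

    #include <ctype.h>

    /* Alphanumerics plus [-_.: ], per the updated comment. */
    static int
    valid_snapshot_char(char c)
    {
            return (isalnum((unsigned char)c) ||
                c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
    }
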
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index fa98192aa50e..434b4829d921 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/u8_textprep.h>
@@ -69,6 +70,16 @@ zfs_prop_init(void)
{ NULL }
};
+ static zprop_index_t dedup_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "sha256,verify",
+ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+ { NULL }
+ };
+
static zprop_index_t compress_table[] = {
{ "on", ZIO_COMPRESS_ON },
{ "off", ZIO_COMPRESS_OFF },
@@ -83,6 +94,7 @@ zfs_prop_init(void)
{ "gzip-7", ZIO_COMPRESS_GZIP_7 },
{ "gzip-8", ZIO_COMPRESS_GZIP_8 },
{ "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { "zle", ZIO_COMPRESS_ZLE },
{ NULL }
};
@@ -92,13 +104,6 @@ zfs_prop_init(void)
{ NULL }
};
- static zprop_index_t acl_mode_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "groupmask", ZFS_ACL_GROUPMASK },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
- };
-
static zprop_index_t acl_inherit_table[] = {
{ "discard", ZFS_ACL_DISCARD },
{ "noallow", ZFS_ACL_NOALLOW },
@@ -142,6 +147,7 @@ zfs_prop_init(void)
{ "2", 2 },
{ "3", 3 },
{ "4", 4 },
+ { "5", 5 },
{ "current", ZPL_VERSION },
{ NULL }
};
@@ -152,6 +158,12 @@ zfs_prop_init(void)
{ NULL }
};
+ static zprop_index_t logbias_table[] = {
+ { "latency", ZFS_LOGBIAS_LATENCY },
+ { "throughput", ZFS_LOGBIAS_THROUGHPUT },
+ { NULL }
+ };
+
static zprop_index_t canmount_table[] = {
{ "off", ZFS_CANMOUNT_OFF },
{ "on", ZFS_CANMOUNT_ON },
@@ -166,162 +178,208 @@ zfs_prop_init(void)
{ NULL }
};
+ static zprop_index_t sync_table[] = {
+ { "standard", ZFS_SYNC_STANDARD },
+ { "always", ZFS_SYNC_ALWAYS },
+ { "disabled", ZFS_SYNC_DISABLED },
+ { NULL }
+ };
+
/* inherit index properties */
- register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "standard | always | disabled", "SYNC",
+ sync_table);
+ zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+ ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME,
"on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
checksum_table);
- register_index(ZFS_PROP_COMPRESSION, "compression",
+ zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | verify | sha256[,verify]", "DEDUP",
+ dedup_table);
+ zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
- register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
+ compress_table);
+ zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"hidden | visible", "SNAPDIR", snapdir_table);
- register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
- register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+ ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"discard | noallow | restricted | passthrough | passthrough-x",
"ACLINHERIT", acl_inherit_table);
- register_index(ZFS_PROP_COPIES, "copies", 1,
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"1 | 2 | 3", "COPIES", copies_table);
- register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
ZFS_CACHE_ALL, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"all | none | metadata", "PRIMARYCACHE", cache_table);
- register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
ZFS_CACHE_ALL, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"all | none | metadata", "SECONDARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "latency | throughput", "LOGBIAS", logbias_table);
/* inherit index (boolean) properties */
- register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
- register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
boolean_table);
- register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
boolean_table);
- register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
boolean_table);
- register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
boolean_table);
- register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
- register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
boolean_table);
- register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
boolean_table);
- register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
boolean_table);
/* default index properties */
- register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"1 | 2 | 3 | 4 | current", "VERSION", version_table);
- register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
"CANMOUNT", canmount_table);
/* readonly index (boolean) properties */
- register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+ zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+ boolean_table);
/* set once index properties */
- register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"none | formC | formD | formKC | formKD", "NORMALIZATION",
normalize_table);
- register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
- PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+ ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_SNAPSHOT,
"sensitive | insensitive | mixed", "CASE", case_table);
/* set once index (boolean) properties */
- register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"on | off", "UTF8ONLY", boolean_table);
/* string properties */
- register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
- register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
- register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
- register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
- ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
- register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+ "MOUNTPOINT");
+ zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+ "SHARENFS");
+ zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
- register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
+ zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "on | off | sharemgr(1M) options", "SHARESMB");
+ zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+ ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+ "<sensitivity label>", "MLSLABEL");
/* readonly number properties */
- register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
ZFS_TYPE_DATASET, "<size>", "USED");
- register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
- register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
- ZFS_TYPE_DATASET, "<size>", "REFER");
- register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
+ zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
PROP_READONLY, ZFS_TYPE_DATASET,
"<1.00x or higher if compressed>", "RATIO");
- register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192,
- PROP_ONETIME,
+ zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+ ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
- register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
- register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
- register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
- register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDSNAP");
+ zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDDS");
+ zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDCHILD");
+ zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+ zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
/* default number properties */
- register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
- register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
- register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "RESERV");
+ zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
- register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
- register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<size> | none", "REFRESERV");
/* inherit number properties */
- register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
- PROP_INHERIT,
+ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+ SPA_MAXBLOCKSIZE, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
/* hidden properties */
- register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+ zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
- register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
- register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+ zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
- register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
- PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
- register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
- ZFS_TYPE_DATASET, "GUID");
- register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
+ "STMF_SBD_LU");
+ zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "GUID");
+ zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+ "USERACCOUNTING");
+ zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+ zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
+
+ /*
+ * Property to be removed once libbe is integrated
+ */
+ zprop_register_hidden(ZFS_PROP_PRIVATE, "priv_prop",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM,
+ "PRIV_PROP");
/* oddball properties */
- register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
- PROP_READONLY, ZFS_TYPE_DATASET,
+ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+ NULL, PROP_READONLY, ZFS_TYPE_DATASET,
"<date>", "CREATION", B_FALSE, B_TRUE, NULL);
}
@@ -329,6 +387,11 @@ boolean_t
zfs_prop_delegatable(zfs_prop_t prop)
{
zprop_desc_t *pd = &zfs_prop_table[prop];
+
+ /* The mlslabel property is never delegatable. */
+ if (prop == ZFS_PROP_MLSLABEL)
+ return (B_FALSE);
+
return (pd->pd_attr != PROP_READONLY);
}
@@ -413,6 +476,12 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
}
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
/*
* Returns TRUE if the property applies to any of the given dataset types.
*/
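
A hedged usage sketch for the new zfs_prop_random_value(): ztest can draw a random
but valid index value for a property and render it back through the existing
conversion routine. Here `seed` is assumed to come from the caller's random-number
source:

    uint64_t val;
    const char *name;

    val = zfs_prop_random_value(ZFS_PROP_COMPRESSION, seed);
    if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, val, &name) == 0)
            (void) printf("compression=%s\n", name);
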
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
index da5ae43093e5..a63262311b3d 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _ZFS_PROP_H
#define _ZFS_PROP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/fs/zfs.h>
#include <sys/types.h>
@@ -79,6 +77,7 @@ typedef struct {
/* "zfs get" help message */
const zprop_index_t *pd_table; /* for index properties, a table */
/* defining the possible values */
+ size_t pd_table_size; /* number of entries in pd_table[] */
} zprop_desc_t;
/*
@@ -99,16 +98,16 @@ zprop_desc_t *zpool_prop_get_table(void);
/*
* Common routines to initialize property tables
*/
-void register_impl(int, const char *, zprop_type_t, uint64_t,
+void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
const char *, zprop_attr_t, int, const char *, const char *,
boolean_t, boolean_t, const zprop_index_t *);
-void register_string(int, const char *, const char *, zprop_attr_t attr,
- int, const char *, const char *);
-void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_string(int, const char *, const char *,
+ zprop_attr_t attr, int, const char *, const char *);
+void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
const char *, const char *);
-void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
const char *, const char *, const zprop_index_t *);
-void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
int, const char *);
/*
@@ -118,6 +117,7 @@ int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
int zprop_name_to_prop(const char *, zfs_type_t);
int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
const char *zprop_values(int, zfs_type_t);
size_t zprop_width(int, boolean_t *, zfs_type_t);
boolean_t zprop_valid_for_type(int, zfs_type_t);
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
index f5efe18d248b..988d05de6e20 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zio.h>
@@ -64,46 +63,57 @@ zpool_prop_init(void)
};
/* string properties */
- register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
ZFS_TYPE_POOL, "<path>", "ALTROOT");
- register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
- register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
- ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+ zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
/* readonly number properties */
- register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "SIZE");
- register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "USED");
- register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY,
- ZFS_TYPE_POOL, "<size>", "AVAIL");
- register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREE");
+ zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+ zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "CAP");
- register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<guid>", "GUID");
- register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<state>", "HEALTH");
+ zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+ "DEDUP");
/* default number properties */
- register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+ zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
/* default index (boolean) properties */
- register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
- ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
- register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
- ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
- register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
- ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
+ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+ zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
/* default index properties */
- register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
"wait | continue | panic", "FAILMODE", failuremode_table);
/* hidden properties */
- register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_POOL, "NAME");
}
@@ -164,6 +174,12 @@ zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
}
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
#ifndef _KERNEL
const char *
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
index d3301b508029..4d7e79c0a4c3 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -65,7 +65,7 @@ zprop_get_numprops(zfs_type_t type)
}
void
-register_impl(int prop, const char *name, zprop_type_t type,
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
int objset_types, const char *values, const char *colname,
boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
@@ -76,6 +76,8 @@ register_impl(int prop, const char *name, zprop_type_t type,
pd = &prop_tbl[prop];
ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+ ASSERT(name != NULL);
+ ASSERT(colname != NULL);
pd->pd_name = name;
pd->pd_propnum = prop;
@@ -89,40 +91,44 @@ register_impl(int prop, const char *name, zprop_type_t type,
pd->pd_rightalign = rightalign;
pd->pd_visible = visible;
pd->pd_table = idx_tbl;
+ pd->pd_table_size = 0;
+ while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+ pd->pd_table_size++;
}
void
-register_string(int prop, const char *name, const char *def,
+zprop_register_string(int prop, const char *name, const char *def,
zprop_attr_t attr, int objset_types, const char *values,
const char *colname)
{
- register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
objset_types, values, colname, B_FALSE, B_TRUE, NULL);
}
void
-register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
- int objset_types, const char *values, const char *colname)
+zprop_register_number(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
{
- register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
objset_types, values, colname, B_TRUE, B_TRUE, NULL);
}
void
-register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
- int objset_types, const char *values, const char *colname,
- const zprop_index_t *idx_tbl)
+zprop_register_index(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname, const zprop_index_t *idx_tbl)
{
- register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
}
void
-register_hidden(int prop, const char *name, zprop_type_t type,
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
zprop_attr_t attr, int objset_types, const char *colname)
{
- register_impl(prop, name, type, 0, NULL, attr,
+ zprop_register_impl(prop, name, type, 0, NULL, attr,
objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
}
@@ -307,6 +313,25 @@ zprop_index_to_string(int prop, uint64_t index, const char **string,
return (-1);
}
+/*
+ * Return a random valid property value. Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+
+ ASSERT((uint_t)prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ idx_tbl = prop_tbl[prop].pd_table;
+
+ if (idx_tbl == NULL)
+ return (seed);
+
+ return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
const char *
zprop_values(int prop, zfs_type_t type)
{
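
The pd_table_size bookkeeping added in zprop_register_impl() is what makes the
seed % pd_table_size indexing in zprop_random_value() safe. A standalone toy
program showing the same NULL-terminated-table walk and seed-to-entry mapping
(the types are stand-ins, not the real zprop_index_t):

    #include <stdio.h>

    typedef struct {
            const char *pi_name;
            unsigned long pi_value;
    } idx_t;

    int
    main(void)
    {
            static const idx_t tbl[] = {
                    { "latency", 0 }, { "throughput", 1 }, { NULL, 0 }
            };
            size_t n = 0;
            unsigned long seed = 5;

            /* Count entries once, as the registration loop now does. */
            while (tbl[n].pi_name != NULL)
                    n++;
            printf("%zu entries, seed %lu -> %s\n",
                n, seed, tbl[seed % n].pi_name);
            return (0);
    }
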
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 2aaf5bcdaff9..2ab1d7b8ea9b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -20,8 +20,8 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+#
#
# This Makefile defines all file modules for the directory uts/common
# and its children. These are the source files which may be considered
@@ -30,8 +30,12 @@
ZFS_COMMON_OBJS += \
arc.o \
bplist.o \
+ bpobj.o \
dbuf.o \
+ ddt.o \
+ ddt_zap.o \
dmu.o \
+ dmu_diff.o \
dmu_send.o \
dmu_object.o \
dmu_objset.o \
@@ -41,17 +45,18 @@ ZFS_COMMON_OBJS += \
dnode_sync.o \
dsl_dir.o \
dsl_dataset.o \
+ dsl_deadlist.o \
dsl_pool.o \
dsl_synctask.o \
dmu_zfetch.o \
dsl_deleg.o \
dsl_prop.o \
- dsl_scrub.o \
- fletcher.o \
+ dsl_scan.o \
gzip.o \
lzjb.o \
metaslab.o \
refcount.o \
+ sa.o \
sha256.o \
spa.o \
spa_config.o \
@@ -75,20 +80,25 @@ ZFS_COMMON_OBJS += \
zap_leaf.o \
zap_micro.o \
zfs_byteswap.o \
+ zfs_debug.o \
zfs_fm.o \
zfs_fuid.o \
+ zfs_sa.o \
zfs_znode.o \
zil.o \
zio.o \
zio_checksum.o \
zio_compress.o \
- zio_inject.o
+ zio_inject.o \
+ zle.o \
+ zrlock.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
zfs_deleg.o \
zfs_prop.o \
zfs_comutil.o \
+ zfs_fletcher.o \
zpool_prop.o \
zprop_common.o
@@ -99,7 +109,9 @@ ZFS_OBJS += \
zfs_ctldir.o \
zfs_dir.o \
zfs_ioctl.o \
+ zfs_ioctl_compat.o \
zfs_log.o \
+ zfs_onexit.o \
zfs_replay.o \
zfs_rlock.o \
rrwlock.o \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index 269c3ebe75d8..436918b35c13 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -40,7 +40,6 @@
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/cred.h>
-#include <sys/kdb.h>
#include <sys/gfs.h>
@@ -108,6 +107,42 @@
* gfs_root_create_file()
*/
+#ifdef sun
+/*
+ * gfs_make_opsvec: take an array of vnode type definitions and create
+ * their vnodeops_t structures
+ *
+ * This routine takes an array of gfs_opsvec_t's. It could
+ * alternatively take an array of gfs_opsvec_t*'s, which would allow
+ * vnode types to be completely defined in files external to the caller
+ * of gfs_make_opsvec(). As it stands, much more sharing takes place --
+ * both the caller and the vnode type provider need to access gfsv_ops
+ * and gfsv_template, and the caller also needs to know gfsv_name.
+ */
+int
+gfs_make_opsvec(gfs_opsvec_t *vec)
+{
+ int error, i;
+
+ for (i = 0; ; i++) {
+ if (vec[i].gfsv_name == NULL)
+ return (0);
+ error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
+ vec[i].gfsv_ops);
+ if (error)
+ break;
+ }
+
+ cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
+ vec[i].gfsv_name);
+ for (i--; i >= 0; i--) {
+ vn_freevnodeops(*vec[i].gfsv_ops);
+ *vec[i].gfsv_ops = NULL;
+ }
+ return (error);
+}
+#endif /* sun */
+
/*
* Low level directory routines
*
@@ -312,6 +347,22 @@ gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
cookies));
}
+#ifdef sun
+/*
+ * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
+ * instead of a string for the entry's name.
+ */
+int
+gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
+ ino64_t ino, unsigned long num)
+{
+ char buf[40];
+
+ numtos(num, buf);
+ return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
+}
+#endif
+
/*
* gfs_readdir_pred: readdir loop predicate
* voffp - a pointer in which the next virtual offset should be stored
@@ -542,6 +593,28 @@ gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
return (vp);
}
+#ifdef sun
+/*
+ * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
+ *
+ * Similar to gfs_root_create(), this creates a root vnode for a file to
+ * be the pseudo-filesystem.
+ */
+vnode_t *
+gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
+{
+ vnode_t *vp = gfs_file_create(size, NULL, ops);
+
+ ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
+
+ VFS_HOLD(vfsp);
+ VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
+ vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
+
+ return (vp);
+}
+#endif /* sun */
+
/*
* gfs_file_inactive()
*
@@ -570,7 +643,7 @@ gfs_file_inactive(vnode_t *vp)
*/
if ((dp = fp->gfs_parent->v_data) == NULL)
return (NULL);
-
+
/*
* First, see if this vnode is cached in the parent.
*/
@@ -995,6 +1068,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
+
/*
* gfs_vop_lookup: VOP_LOOKUP() entry point
*
@@ -1062,6 +1136,81 @@ gfs_vop_readdir(ap)
return (error);
}
+
+#ifdef sun
+/*
+ * gfs_vop_map: VOP_MAP() entry point
+ *
+ * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
+ * This function only works for readonly files, and uses the read function for
+ * the vnode to fill in the data. The mapped data is immediately faulted in and
+ * filled with the necessary data during this call; there are no getpage() or
+ * putpage() routines.
+ */
+/* ARGSUSED */
+int
+gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
+ caller_context_t *ct)
+{
+ int rv;
+ ssize_t resid = len;
+
+ /*
+ * Check for bad parameters
+ */
+#ifdef _ILP32
+ if (len > MAXOFF_T)
+ return (ENOMEM);
+#endif
+ if (vp->v_flag & VNOMAP)
+ return (ENOTSUP);
+ if (off > MAXOFF_T)
+ return (EFBIG);
+ if ((long)off < 0 || (long)(off + len) < 0)
+ return (EINVAL);
+ if (vp->v_type != VREG)
+ return (ENODEV);
+ if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
+ return (EACCES);
+
+ /*
+ * Find appropriate address if needed, otherwise clear address range.
+ */
+ as_rangelock(as);
+ rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (rv != 0) {
+ as_rangeunlock(as);
+ return (rv);
+ }
+
+ /*
+ * Create mapping
+ */
+ rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
+ as_rangeunlock(as);
+ if (rv != 0)
+ return (rv);
+
+ /*
+ * Fill with data from read()
+ */
+ rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
+ 0, (rlim64_t)0, cred, &resid);
+
+ if (rv == 0 && resid != 0)
+ rv = ENXIO;
+
+ if (rv != 0) {
+ as_rangelock(as);
+ (void) as_unmap(as, *addrp, len);
+ as_rangeunlock(as);
+ }
+
+ return (rv);
+}
+#endif /* sun */
+
/*
* gfs_vop_inactive: VOP_INACTIVE() entry point
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
index f4e2449f018f..83f29c154de9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -18,9 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -75,7 +75,6 @@ xva_getxoptattr(xvattr_t *xvap)
static void
vn_rele_inactive(vnode_t *vp)
{
-
vrele(vp);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 38b39bf021fb..2adad8ad726c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -119,16 +118,17 @@
#include <sys/spa.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <zfs_fletcher.h>
#include <sys/sdt.h>
#include <vm/vm_pageout.h>
@@ -178,7 +178,6 @@ static boolean_t arc_warm;
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
@@ -186,14 +185,11 @@ int zfs_arc_p_min_shift = 0;
TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
"Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
"Minimum ARC size");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
* Note that buffers can be in one of 6 states:
@@ -500,6 +496,7 @@ struct arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
@@ -560,7 +557,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
-#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -609,8 +605,8 @@ static buf_hash_table_t buf_hash_table;
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(buf) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
@@ -634,7 +630,7 @@ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
@@ -788,6 +784,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+}
+
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
@@ -921,7 +926,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
arc_buf_t *buf = vbuf;
bzero(buf, sizeof (arc_buf_t));
- rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
@@ -937,6 +943,7 @@ hdr_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *buf = vbuf;
+ ASSERT(BUF_EMPTY(buf));
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
@@ -949,7 +956,8 @@ buf_dest(void *vbuf, void *unused)
{
arc_buf_t *buf = vbuf;
- rw_destroy(&buf->b_lock);
+ mutex_destroy(&buf->b_evict_lock);
+ rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
@@ -1077,18 +1085,31 @@ arc_buf_thaw(arc_buf_t *buf)
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_thawed)
+ kmem_free(buf->b_hdr->b_thawed, 1);
+ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
void
arc_buf_freeze(arc_buf_t *buf)
{
+ kmutex_t *hash_lock;
+
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
+ mutex_exit(hash_lock);
}
static void
@@ -1111,7 +1132,6 @@ get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lo
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
-
ASSERT(MUTEX_HELD(hash_lock));
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
@@ -1185,6 +1205,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
@@ -1207,7 +1228,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
/*
* If prefetching out of the ghost cache,
- * we will have a non-null datacnt.
+ * we will have a non-zero datacnt.
*/
if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
/* ghost elements have a ghost size */
@@ -1245,9 +1266,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon) {
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
buf_hash_remove(ab);
- }
/* adjust state sizes */
if (to_delta)
@@ -1391,14 +1411,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_state == arc_anon);
ASSERT(buf->b_data != NULL);
- VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
- VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
@@ -1406,6 +1441,8 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
+ ASSERT(hdr->b_state != arc_anon);
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@@ -1430,16 +1467,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
* must verify b_data != NULL to know if the add_ref
* was successful.
*/
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
if (buf->b_data == NULL) {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return;
}
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
- rw_exit(&buf->b_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ mutex_exit(&buf->b_evict_lock);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
@@ -1487,6 +1524,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1524,6 +1562,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
+ buf->b_next = NULL;
ASSERT(buf->b_efunc == NULL);
@@ -1538,55 +1577,55 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!(hdr->b_flags & ARC_STORED));
+ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
- if (hdr->b_l2hdr != NULL) {
- if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
+ if (l2hdr != NULL) {
+ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ if (!buflist_held) {
mutex_enter(&l2arc_buflist_mtx);
- if (hdr->b_l2hdr != NULL) {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
- hdr);
- }
- mutex_exit(&l2arc_buflist_mtx);
- } else {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ l2hdr = hdr->b_l2hdr;
}
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
+
+ if (l2hdr != NULL) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ if (!buflist_held)
+ mutex_exit(&l2arc_buflist_mtx);
}
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
}
while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
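The restructured l2hdr teardown is a check/lock/recheck: b_l2hdr is re-read after l2arc_buflist_mtx is acquired, since l2arc_evict() may have detached it while we waited, exactly as the comment explains. The same pattern as a stand-alone sketch, with pthreads standing in for kmutex_t and all names hypothetical:

    #include <pthread.h>
    #include <stdlib.h>

    struct l2hdr { int b_unused; };
    struct hdr { struct l2hdr *b_l2hdr; };

    static pthread_mutex_t buflist_mtx = PTHREAD_MUTEX_INITIALIZER;

    static void
    l2hdr_teardown(struct hdr *h, int buflist_held)
    {
            struct l2hdr *l2 = h->b_l2hdr;          /* unlocked peek */

            if (l2 == NULL)
                    return;
            if (!buflist_held) {
                    pthread_mutex_lock(&buflist_mtx);
                    l2 = h->b_l2hdr;        /* recheck under the lock */
            }
            if (l2 != NULL) {
                    /* list_remove(), stat updates, ... */
                    free(l2);
                    h->b_l2hdr = NULL;
            }
            if (!buflist_held)
                    pthread_mutex_unlock(&buflist_mtx);
    }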
@@ -1596,6 +1635,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
+ if (hdr->b_thawed) {
+ kmem_free(hdr->b_thawed, 1);
+ hdr->b_thawed = NULL;
+ }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1616,11 +1659,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1)
+ if (hdr->b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
- else
+ } else {
+ ASSERT(buf == hdr->b_buf);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
mutex_exit(hash_lock);
} else if (HDR_IO_IN_PROGRESS(hdr)) {
int destroy_hdr;
@@ -1637,12 +1686,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
- if (remove_reference(hdr, NULL, tag) > 0) {
- ASSERT(HDR_IO_ERROR(hdr));
+ if (remove_reference(hdr, NULL, tag) > 0)
arc_buf_destroy(buf, FALSE, TRUE);
- } else {
+ else
arc_hdr_destroy(hdr);
- }
}
}
@@ -1654,11 +1701,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
int no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
+ ASSERT(hdr->b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT(hdr->b_state != arc_anon);
ASSERT(buf->b_data != NULL);
@@ -1668,6 +1718,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
}
ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1747,7 +1798,8 @@ evict_start:
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
- LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - ab->b_arc_access <
+ arc_min_prefetch_lifespan)) {
skipped++;
continue;
}
@@ -1762,7 +1814,7 @@ evict_start:
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
- if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
@@ -1784,9 +1836,9 @@ evict_start:
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
@@ -1887,6 +1939,7 @@ static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t marker = { 0 };
list_t *list, *list_start;
kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
@@ -1913,7 +1966,15 @@ evict_start:
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
hash_lock = HDR_LOCK(ab);
+ /* caller may be trying to modify this buffer, skip it */
+ if (MUTEX_HELD(hash_lock))
+ continue;
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -1936,18 +1997,21 @@ evict_start:
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
- } else {
- if (bytes < 0) {
- /*
- * we're draining the ARC, retry
- */
- mutex_exit(lock);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto evict_start;
- }
+ } else if (bytes < 0) {
+ /*
+ * Insert a list marker and then wait for the
+ * hash lock to become available. Once it's
+ * available, restart from where we left off.
+ */
+ list_insert_after(list, ab, &marker);
+ mutex_exit(lock);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ mutex_enter(lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ } else
bufs_skipped += 1;
- }
}
mutex_exit(lock);
idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
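The marker above replaces the old drop-everything goto restart: a stack-allocated header with b_spa == 0 is parked in the list before the list lock is dropped, and the walk resumes from its neighbor afterwards; the new "ignore markers" test earlier in the loop keeps other scans from treating it as a real header. A minimal sketch of the sentinel bookkeeping (hypothetical types):

    #include <stddef.h>

    struct node { struct node *prev, *next; int is_marker; };

    /* Park 'm' right after 'pos' before dropping the list lock. */
    static void
    marker_insert(struct node *pos, struct node *m)
    {
            m->prev = pos;
            m->next = pos->next;
            if (pos->next != NULL)
                    pos->next->prev = m;
            pos->next = m;
    }

    /* After reacquiring the lock: unlink 'm', resume at its predecessor. */
    static struct node *
    marker_remove(struct node *m)
    {
            struct node *resume = m->prev;

            resume->next = m->next;
            if (m->next != NULL)
                    m->next->prev = resume;
            return (resume);
    }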
@@ -2056,9 +2120,9 @@ restart:
while (tmp_arc_eviction_list != NULL) {
arc_buf_t *buf = tmp_arc_eviction_list;
tmp_arc_eviction_list = buf->b_next;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
if (buf->b_efunc != NULL)
VERIFY(buf->b_efunc(buf) == 0);
@@ -2148,11 +2212,9 @@ static int needfree = 0;
static int
arc_reclaim_needed(void)
{
-#if 0
- uint64_t extra;
-#endif
#ifdef _KERNEL
+
if (needfree)
return (1);
@@ -2163,7 +2225,7 @@ arc_reclaim_needed(void)
if (vm_paging_needed())
return (1);
-#if 0
+#ifdef sun
/*
* take 'desfree' extra pages, so we reclaim sooner, rather than later
*/
@@ -2205,10 +2267,10 @@ arc_reclaim_needed(void)
(btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
return (1);
#endif
-#else
+#else /* !sun */
if (kmem_used() > (kmem_size() * 3) / 4)
return (1);
-#endif
+#endif /* sun */
#else
if (spa_get_random(100) == 0)
@@ -2290,7 +2352,7 @@ arc_reclaim_thread(void *dummy __unused)
}
/* reset the growth delay for every reclaim */
- growtime = LBOLT + (arc_grow_retry * hz);
+ growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
/*
@@ -2304,7 +2366,7 @@ arc_reclaim_thread(void *dummy __unused)
arc_kmem_reap_now(last_reclaim);
arc_warm = B_TRUE;
- } else if (arc_no_grow && LBOLT >= growtime) {
+ } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
arc_no_grow = FALSE;
}
@@ -2411,7 +2473,7 @@ arc_evict_needed(arc_buf_contents_t type)
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
return (1);
-#if 0
+#ifdef sun
#ifdef _KERNEL
/*
* If zio data pages are being allocated out of a separate heap segment,
@@ -2423,7 +2485,7 @@ arc_evict_needed(arc_buf_contents_t type)
(vmem_size(zio_arena, VMEM_ALLOC) >> 5))
return (1);
#endif
-#endif
+#endif /* sun */
if (arc_reclaim_needed())
return (1);
@@ -2543,6 +2605,8 @@ out:
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
+ clock_t now;
+
ASSERT(MUTEX_HELD(hash_lock));
if (buf->b_state == arc_anon) {
@@ -2553,11 +2617,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
ASSERT(buf->b_arc_access == 0);
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
arc_change_state(arc_mru, buf, hash_lock);
} else if (buf->b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
/*
* If this buffer is here because of a prefetch, then either:
* - clear the flag if this is a "referencing" read
@@ -2573,7 +2639,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = now;
return;
}
@@ -2582,13 +2648,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
+ if (now > buf->b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
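LBOLT becomes ddi_get_lbolt() throughout, and arc_access() now samples the clock once into now. The promotion rule is unchanged: an MRU buffer moves to MFU only after ARC_MINTIME ticks have passed since its last access. A sketch of the check, assuming hz = 1000 and ARC_MINTIME = hz >> 3 (~125 ms, matching the comment; both values are assumptions, not taken from this diff):

    #define HZ 1000                 /* assumed tick rate */
    #define ARC_MINTIME (HZ >> 3)   /* assumed hz/8, i.e. ~125 ms */

    /* Promote MRU -> MFU only after the buffer has aged past ARC_MINTIME. */
    static int
    should_promote(long now, long last_access)
    {
            return (now > last_access + ARC_MINTIME);
    }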
@@ -2611,7 +2677,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2630,7 +2696,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(list_link_active(&buf->b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
@@ -2648,7 +2714,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
new_state = arc_mru;
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
@@ -2658,7 +2724,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* This buffer is on the 2nd Level ARC.
*/
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
} else {
@@ -2671,7 +2737,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ if (zio == NULL || zio->io_error == 0)
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
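arc_bcopy_func now copies the data out only when there was no zio or the read succeeded, instead of unconditionally; the buffer reference is dropped either way. The same guard, as a self-contained sketch with stand-in types:

    #include <stddef.h>
    #include <string.h>

    struct zio_sim { int io_error; };
    struct buf_sim { void *b_data; size_t b_size; };

    static void
    bcopy_done(struct zio_sim *zio, struct buf_sim *buf, void *arg)
    {
            /* Copy only on success; a failed read leaves b_data stale. */
            if (zio == NULL || zio->io_error == 0)
                    memcpy(arg, buf->b_data, buf->b_size);
            /* the real callback still drops its buffer reference here */
    }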
@@ -2685,6 +2752,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
*bufp = NULL;
} else {
*bufp = buf;
+ ASSERT(buf->b_data);
}
}
@@ -2732,6 +2800,16 @@ arc_read_done(zio_t *zio)
arc_cksum_compute(buf, B_FALSE);
+ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
/* create copies of the data buffer for the callers */
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2745,8 +2823,11 @@ arc_read_done(zio_t *zio)
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
+ if (abuf == buf) {
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(hdr->b_datacnt == 1);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
@@ -2767,14 +2848,6 @@ arc_read_done(zio_t *zio)
cv_broadcast(&hdr->b_cv);
if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
/*
@@ -2825,27 +2898,37 @@ arc_read_done(zio_t *zio)
*
* Normal callers should use arc_read and pass the arc buffer and offset
* for the bp. But if you know you don't need locking, you can use
- * arc_read_bp.
+ * arc_read_nolock.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */
+ return (arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb));
+ }
+
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_lock, RW_READER);
+ rw_enter(&pbuf->b_data_lock, RW_READER);
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_lock);
+ rw_exit(&pbuf->b_data_lock);
+
return (err);
}
int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
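arc_read() is now an explicit wrapper: with no parent buffer it falls through to arc_read_nolock() (the traverse-callback case flagged by the XXX comment), and otherwise it holds the parent's renamed b_data_lock as reader across the call so the embedded bp cannot shift underneath it. The wrapper's shape, sketched with a pthread rwlock and hypothetical names:

    #include <pthread.h>
    #include <stddef.h>

    struct parent_buf { pthread_rwlock_t b_data_lock; };

    static int
    read_nolock(void)
    {
            return (0);     /* stands in for arc_read_nolock(...) */
    }

    static int
    read_wrapper(struct parent_buf *pbuf)
    {
            int err;

            if (pbuf == NULL)       /* no parent: lockless path */
                    return (read_nolock());

            pthread_rwlock_rdlock(&pbuf->b_data_lock);
            err = read_nolock();
            pthread_rwlock_unlock(&pbuf->b_data_lock);
            return (err);
    }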
@@ -2856,7 +2939,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
uint64_t guid = spa_guid(spa);
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
if (hdr && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@@ -2910,6 +2994,7 @@ top:
} else {
buf = arc_buf_clone(buf);
}
+
} else if (*arc_flags & ARC_PREFETCH &&
refcount_count(&hdr->b_refcnt) == 0) {
hdr->b_flags |= ARC_PREFETCH;
@@ -2940,15 +3025,13 @@ top:
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
if (exists) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
(void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
@@ -2983,12 +3066,14 @@ top:
buf->b_private = NULL;
buf->b_next = NULL;
hdr->b_buf = buf;
- arc_get_data_buf(buf);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
-
+ arc_get_data_buf(buf);
+ arc_access(hdr, hash_lock);
}
+ ASSERT(!GHOST_STATE(hdr->b_state));
+
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
@@ -2997,17 +3082,6 @@ top:
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
- /*
- * If the buffer has been evicted, migrate it to a present state
- * before issuing the I/O. Once we drop the hash-table lock,
- * the header will be marked as I/O in progress and have an
- * attached buffer. At this point, anybody who finds this
- * buffer ought to notice that it's legit but has a pending I/O.
- */
-
- if (GHOST_STATE(hdr->b_state))
- arc_access(hdr, hash_lock);
-
if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -3023,8 +3097,8 @@ top:
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
- zbookmark_t *, zb);
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+ uint64_t, size, zbookmark_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -3110,47 +3184,15 @@ top:
return (0);
}
-/*
- * arc_read() variant to support pool traversal. If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_mtx;
- uint64_t guid = spa_guid(spa);
- int rc = 0;
-
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
- if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
-
- ASSERT(buf);
- while (buf->b_data == NULL) {
- buf = buf->b_next;
- ASSERT(buf);
- }
- bcopy(buf->b_data, data, hdr->b_size);
- } else {
- rc = ENOENT;
- }
-
- if (hash_mtx)
- mutex_exit(hash_mtx);
-
- return (rc);
-}
-
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
ASSERT(buf->b_hdr != NULL);
ASSERT(buf->b_hdr->b_state != arc_anon);
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
buf->b_efunc = func;
buf->b_private = private;
}
@@ -3169,14 +3211,14 @@ arc_buf_evict(arc_buf_t *buf)
list_t *list, *evicted_list;
kmutex_t *lock, *evicted_lock;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (0);
} else if (buf->b_data == NULL) {
arc_buf_t copy = *buf; /* structure assignment */
@@ -3185,14 +3227,15 @@ arc_buf_evict(arc_buf_t *buf)
* but let arc_do_user_evicts() do the reaping.
*/
buf->b_efunc = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(copy.b_efunc(&copy) == 0);
return (1);
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -3211,6 +3254,7 @@ arc_buf_evict(arc_buf_t *buf)
arc_state_t *old_state = hdr->b_state;
arc_state_t *evicted_state;
+ ASSERT(hdr->b_buf == NULL);
ASSERT(refcount_is_zero(&hdr->b_refcnt));
evicted_state =
@@ -3230,12 +3274,13 @@ arc_buf_evict(arc_buf_t *buf)
mutex_exit(lock);
}
mutex_exit(hash_lock);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_hdr = NULL;
+ buf->b_next = NULL;
kmem_cache_free(buf_cache, buf);
return (1);
}
@@ -3250,29 +3295,30 @@ void
arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size;
- boolean_t released = B_FALSE;
- rw_enter(&buf->b_lock, RW_WRITER);
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
- ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- rw_exit(&buf->b_lock);
- released = B_TRUE;
} else {
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
}
l2hdr = hdr->b_l2hdr;
@@ -3282,9 +3328,6 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size;
}
- if (released)
- goto out;
-
/*
* Do we have more than one buf?
*/
@@ -3298,14 +3341,14 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
+ *bufp = buf->b_next;
buf->b_next = NULL;
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3333,26 +3376,25 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
+ if (hash_lock)
+ mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
-out:
if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3361,14 +3403,27 @@ out:
}
}
+/*
+ * Release this buffer. If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb)
+{
+ arc_release(buf, tag);
+ return (0);
+}
+
int
arc_released(arc_buf_t *buf)
{
int released;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (released);
}
@@ -3377,9 +3432,9 @@ arc_has_callback(arc_buf_t *buf)
{
int callback;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
callback = (buf->b_efunc != NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (callback);
}
@@ -3389,9 +3444,9 @@ arc_referenced(arc_buf_t *buf)
{
int referenced;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
referenced = (refcount_count(&buf->b_hdr->b_refcnt));
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (referenced);
}
#endif
@@ -3431,21 +3486,28 @@ arc_write_done(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- hdr->b_acb = NULL;
+ ASSERT(hdr->b_acb == NULL);
+
+ if (zio->io_error == 0) {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ } else {
+ ASSERT(BUF_EMPTY(hdr));
+ }
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefor remain anonymous (and uncached).
+ * so there will be no dva/birth/checksum. The buffer must
+ * therefore remain anonymous (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
+ ASSERT(zio->io_error == 0);
+
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
@@ -3455,106 +3517,54 @@ arc_write_done(zio_t *zio)
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
- ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
- ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)));
- ASSERT3U(zio->io_bp_orig.blk_birth, ==,
- zio->io_bp->blk_birth);
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (hdr->b_state == arc_anon)
+ if (!exists && hdr->b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- hdr->b_flags &= ~ARC_STORED;
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
}
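The collision path above now panics outright if a sync-to-convergence rewrite does not reproduce the identical block pointer, and treats a dedup hit as the only other legitimate collision. As I understand BP_EQUAL, two bps are equal when their DVAs and physical birth txg match; a simplified sketch of that identity test (hypothetical types, not the real macro):

    #include <stdint.h>
    #include <string.h>

    struct bp_sim {
            uint64_t dva[3][2];     /* up to three copies */
            uint64_t phys_birth;
    };

    /* Two bps name the same block iff DVAs and birth txg all match. */
    static int
    bp_equal(const struct bp_sim *a, const struct bp_sim *b)
    {
            return (a->phys_birth == b->phys_birth &&
                memcmp(a->dva, b->dva, sizeof (a->dva)) == 0);
    }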
-static void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
- boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
- /* Determine checksum setting */
- if (ismd) {
- /*
- * Metadata always gets checksummed. If the data
- * checksum is multi-bit correctable, and it's not a
- * ZBT-style checksum, then it's suitable for metadata
- * as well. Otherwise, the metadata checksum defaults
- * to fletcher4.
- */
- if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
- !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
- zp->zp_checksum = wp->wp_oschecksum;
- else
- zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
- wp->wp_oschecksum);
- }
-
- /* Determine compression setting */
- if (ismd) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
- } else {
- zp->zp_compress = zio_compress_select(wp->wp_dncompress,
- wp->wp_oscompress);
- }
-
- zp->zp_type = wp->wp_type;
- zp->zp_level = wp->wp_level;
- zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
- zio_prop_t zp;
ASSERT(ready != NULL);
+ ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
+ ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3563,103 +3573,27 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
callback->awcb_private = private;
callback->awcb_buf = buf;
- write_policy(spa, wp, &zp);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- arc_buf_hdr_t *ab;
- kmutex_t *hash_lock;
- zio_t *zio;
- uint64_t guid = spa_guid(spa);
-
- /*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
- */
- ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
- bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
- if (ab->b_state != arc_anon)
- arc_change_state(arc_anon, ab, hash_lock);
- if (HDR_IO_IN_PROGRESS(ab)) {
- /*
- * This should only happen when we prefetch.
- */
- ASSERT(ab->b_flags & ARC_PREFETCH);
- ASSERT3U(ab->b_datacnt, ==, 1);
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
- ab->b_flags |= ARC_FREE_IN_PROGRESS;
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- }
- }
-
- zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
-}
-
static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count
- + cnt.v_cache_count);
+ uint64_t available_memory =
+ ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
-#if 0
+#ifdef sun
#if defined(__i386)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
-#endif
+#endif /* sun */
if (available_memory >= zfs_write_limit_max)
return (0);
@@ -3776,10 +3710,12 @@ arc_lowmem(void *arg __unused, int howto __unused)
/* Serialize access via arc_lowmem_lock. */
mutex_enter(&arc_lowmem_lock);
+ mutex_enter(&arc_reclaim_thr_lock);
needfree = 1;
cv_signal(&arc_reclaim_thr_cv);
while (needfree)
- tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
+ msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
+ mutex_exit(&arc_reclaim_thr_lock);
mutex_exit(&arc_lowmem_lock);
}
#endif
@@ -3787,8 +3723,7 @@ arc_lowmem(void *arg __unused, int howto __unused)
void
arc_init(void)
{
- int prefetch_tunable_set = 0;
- int i;
+ int i, prefetch_tunable_set = 0;
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
@@ -3799,7 +3734,8 @@ arc_init(void)
/* Start out with 1/8 of all memory */
arc_c = kmem_size() / 8;
-#if 0
+
+#ifdef sun
#ifdef _KERNEL
/*
* On architectures where the physical memory can be larger
@@ -3808,7 +3744,7 @@ arc_init(void)
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
-#endif
+#endif /* sun */
/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<18);
/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
@@ -3817,16 +3753,18 @@ arc_init(void)
else
arc_c_max = arc_c_min;
arc_c_max = MAX(arc_c * 5, arc_c_max);
+
#ifdef _KERNEL
/*
* Allow the tunables to override our calculations if they are
* reasonable (ie. over 16MB)
*/
- if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
+ if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
arc_c_max = zfs_arc_max;
- if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
+ if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
arc_c_min = zfs_arc_min;
#endif
+
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
@@ -3936,7 +3874,7 @@ arc_init(void)
"-- to enable,\n");
printf(" add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
- zfs_prefetch_disable=1;
+ zfs_prefetch_disable = 1;
}
#else
if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
@@ -3945,7 +3883,7 @@ arc_init(void)
"than 4GB of RAM is present;\n"
" to enable, add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
- zfs_prefetch_disable=1;
+ zfs_prefetch_disable = 1;
}
#endif
/* Warn about ZFS memory and address space requirements. */
@@ -4199,7 +4137,7 @@ l2arc_write_size(l2arc_dev_t *dev)
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
- clock_t interval, next;
+ clock_t interval, next, now;
/*
* If the ARC lists are busy, increase our write rate; if the
@@ -4212,7 +4150,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
else
interval = hz * l2arc_feed_secs;
- next = MAX(LBOLT, MIN(LBOLT + interval, began + interval));
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
return (next);
}
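Same clamp as before, just built on the cached now: the next feed wakeup is never in the past and never more than one interval past either the current time or the start of this write cycle. The arithmetic, stand-alone:

    static long
    next_wakeup(long now, long began, long interval)
    {
            long want = now + interval;
            long cap = began + interval;
            long next = (want < cap) ? want : cap;  /* MIN(now+i, began+i) */

            return (next > now ? next : now);       /* MAX(now, ...) */
    }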
@@ -4414,11 +4353,11 @@ l2arc_read_done(zio_t *zio)
ASSERT(cb != NULL);
buf = cb->l2rcb_buf;
ASSERT(buf != NULL);
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
* Check this survived the L2ARC journey.
@@ -4632,7 +4571,7 @@ top:
}
mutex_exit(&l2arc_buflist_mtx);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
dev->l2ad_evict = taddr;
}
@@ -4802,15 +4741,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- spa_l2cache_space_update(dev->l2ad_vdev, 0,
- dev->l2ad_end - dev->l2ad_hand);
+ vdev_space_update(dev->l2ad_vdev,
+ dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
@@ -4834,7 +4773,7 @@ l2arc_feed_thread(void *dummy __unused)
l2arc_dev_t *dev;
spa_t *spa;
uint64_t size, wrote;
- clock_t begin, next = LBOLT;
+ clock_t begin, next = ddi_get_lbolt();
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
@@ -4843,9 +4782,9 @@ l2arc_feed_thread(void *dummy __unused)
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next - LBOLT);
+ next - ddi_get_lbolt());
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
- next = LBOLT + hz;
+ next = ddi_get_lbolt() + hz;
/*
* Quick check for L2ARC devices.
@@ -4856,7 +4795,7 @@ l2arc_feed_thread(void *dummy __unused)
continue;
}
mutex_exit(&l2arc_dev_mtx);
- begin = LBOLT;
+ begin = ddi_get_lbolt();
/*
* This selects the next l2arc device to write to, and in
@@ -4875,6 +4814,16 @@ l2arc_feed_thread(void *dummy __unused)
ASSERT(spa != NULL);
/*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
* Avoid contributing to memory pressure.
*/
if (arc_reclaim_needed()) {
@@ -4931,7 +4880,7 @@ l2arc_vdev_present(vdev_t *vd)
* validated the vdev and opened it.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
@@ -4945,8 +4894,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_vdev = vd;
adddev->l2ad_write = l2arc_write_max;
adddev->l2ad_boost = l2arc_write_boost;
- adddev->l2ad_start = start;
- adddev->l2ad_end = end;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
@@ -4961,7 +4910,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2node));
- spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
/*
* Add device to global list
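l2arc_add_vdev() no longer takes start/end from its caller; the usable range is derived from the vdev itself, skipping the front label region. A sketch of that arithmetic; the 4 MB value for VDEV_LABEL_START_SIZE is my assumption from the vdev label layout, not something this diff states:

    #include <stdint.h>

    #define VDEV_LABEL_START_SIZE_SIM (4ULL << 20)  /* assumed 4 MB */

    /* Usable L2ARC span: [label area, label area + min usable asize). */
    static void
    l2ad_range(uint64_t min_asize, uint64_t *startp, uint64_t *endp)
    {
            *startp = VDEV_LABEL_START_SIZE_SIM;
            *endp = VDEV_LABEL_START_SIZE_SIM + min_asize;
    }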
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
index 93b7741d77be..066ccc6b1e05 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -19,331 +19,51 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/bplist.h>
#include <sys/zfs_context.h>
-static int
-bplist_hold(bplist_t *bpl)
-{
- ASSERT(MUTEX_HELD(&bpl->bpl_lock));
- if (bpl->bpl_dbuf == NULL) {
- int err = dmu_bonus_hold(bpl->bpl_mos,
- bpl->bpl_object, bpl, &bpl->bpl_dbuf);
- if (err)
- return (err);
- bpl->bpl_phys = bpl->bpl_dbuf->db_data;
- }
- return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
- return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
- DMU_OT_BPLIST_HDR, size, tx));
-}
void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+bplist_create(bplist_t *bpl)
{
- VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(mos, object, &doi);
- if (err)
- return (err);
-
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_dbuf == NULL);
- ASSERT(bpl->bpl_phys == NULL);
- ASSERT(bpl->bpl_cached_dbuf == NULL);
- ASSERT(bpl->bpl_queue == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
- bpl->bpl_mos = mos;
- bpl->bpl_object = object;
- bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
- bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
- bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
}
void
-bplist_close(bplist_t *bpl)
-{
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_queue == NULL);
-
- if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- bpl->bpl_cached_dbuf = NULL;
- }
- if (bpl->bpl_dbuf) {
- dmu_buf_rele(bpl->bpl_dbuf, bpl);
- bpl->bpl_dbuf = NULL;
- bpl->bpl_phys = NULL;
- }
-
- mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
- boolean_t rv;
-
- if (bpl->bpl_object == 0)
- return (B_TRUE);
-
- mutex_enter(&bpl->bpl_lock);
- VERIFY(0 == bplist_hold(bpl)); /* XXX */
- rv = (bpl->bpl_phys->bpl_entries == 0);
- mutex_exit(&bpl->bpl_lock);
-
- return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
- int err = 0;
-
- if (bpl->bpl_cached_dbuf == NULL ||
- bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
- if (bpl->bpl_cached_dbuf != NULL)
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- err = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blkid << bpl->bpl_blockshift,
- bpl, &bpl->bpl_cached_dbuf);
- ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
- 1ULL << bpl->bpl_blockshift);
- }
- return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
-
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
+bplist_destroy(bplist_t *bpl)
{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err)
- return (err);
-
- blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
- off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
- bparray = bpl->bpl_cached_dbuf->db_data;
- bparray[off] = *bp;
-
- /* We never need the fill count. */
- bparray[off].blk_fill = 0;
-
- /* The bplist will compress better if we can leave off the checksum */
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- bpl->bpl_phys->bpl_entries++;
- bpl->bpl_phys->bpl_bytes +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
- bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpl->bpl_lock);
-
- return (0);
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
}
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
{
- bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
- ASSERT(!BP_IS_HOLE(bp));
mutex_enter(&bpl->bpl_lock);
- bpq->bpq_blk = *bp;
- bpq->bpq_next = bpl->bpl_queue;
- bpl->bpl_queue = bpq;
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
}
void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
{
- bplist_q_t *bpq;
+ bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpq = bpl->bpl_queue) != NULL) {
- bpl->bpl_queue = bpq->bpq_next;
+ while (bpe = list_head(&bpl->bpl_list)) {
+ list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
- VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
- kmem_free(bpq, sizeof (*bpq));
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
mutex_enter(&bpl->bpl_lock);
}
mutex_exit(&bpl->bpl_lock);
}
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
- mutex_enter(&bpl->bpl_lock);
- ASSERT3P(bpl->bpl_queue, ==, NULL);
- VERIFY(0 == bplist_hold(bpl));
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- VERIFY(0 == dmu_free_range(bpl->bpl_mos,
- bpl->bpl_object, 0, -1ULL, tx));
- bpl->bpl_phys->bpl_entries = 0;
- bpl->bpl_phys->bpl_bytes = 0;
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp = 0;
- bpl->bpl_phys->bpl_uncomp = 0;
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- *usedp = bpl->bpl_phys->bpl_bytes;
- if (bpl->bpl_havecomp) {
- *compp = bpl->bpl_phys->bpl_comp;
- *uncompp = bpl->bpl_phys->bpl_uncomp;
- }
- mutex_exit(&bpl->bpl_lock);
-
- if (!bpl->bpl_havecomp) {
- uint64_t itor = 0, comp = 0, uncomp = 0;
- blkptr_t bp;
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- comp += BP_GET_PSIZE(&bp);
- uncomp += BP_GET_UCSIZE(&bp);
- }
- if (err == ENOENT)
- err = 0;
- *compp = comp;
- *uncompp = uncomp;
- }
-
- return (err);
-}
-
-/*
- * Return (in *dasizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *dasizep)
-{
- uint64_t size = 0;
- uint64_t itor = 0;
- blkptr_t bp;
- int err;
-
- /*
- * As an optimization, if they want the whole txg range, just
- * get bpl_bytes rather than iterating over the bps.
- */
- if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err == 0)
- *dasizep = bpl->bpl_phys->bpl_bytes;
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
- size +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
- }
- }
- if (err == ENOENT)
- err = 0;
- *dasizep = size;
- return (err);
-}
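Net effect of the bplist rewrite: it is no longer an on-disk object with dbuf plumbing but a simple mutex-protected in-memory list drained through a caller-supplied callback (the on-disk role moves to the new bpobj below). A hedged usage sketch; the bplist_itor_t signature is inferred from the func(arg, &bpe->bpe_blk, tx) call site, and the helper names are hypothetical:

    #include <sys/bplist.h>
    #include <sys/zfs_context.h>

    /* Hypothetical iterator: tally the bps drained from the list. */
    static int
    count_bp_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
    {
            (*(uint64_t *)arg)++;
            return (0);
    }

    static uint64_t
    drain_bplist(bplist_t *bpl, dmu_tx_t *tx)
    {
            uint64_t n = 0;

            bplist_iterate(bpl, count_bp_cb, &n, tx);   /* empties the list */
            return (n);
    }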
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
new file mode 100644
index 000000000000..72be31235607
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
@@ -0,0 +1,495 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+ boolean_t free)
+{
+ dmu_object_info_t doi;
+ int epb;
+ int64_t i;
+ int err = 0;
+ dmu_buf_t *dbuf = NULL;
+
+ mutex_enter(&bpo->bpo_lock);
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ blkptr_t *bparray;
+ blkptr_t *bp;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (blkptr_t);
+ blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+ FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ bparray = dbuf->db_data;
+ bp = &bparray[blkoff];
+ err = func(arg, bp, tx);
+ if (err)
+ break;
+ if (free) {
+ bpo->bpo_phys->bpo_bytes -=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ i++;
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+ i * sizeof (blkptr_t), -1ULL, tx));
+ }
+ if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ ASSERT(bpo->bpo_havecomp);
+ err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+ if (err) {
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+ }
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+ bpobj_t sublist;
+ uint64_t used_before, comp_before, uncomp_before;
+ uint64_t used_after, comp_after, uncomp_after;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+ if (err)
+ break;
+ if (free) {
+ err = bpobj_space(&sublist,
+ &used_before, &comp_before, &uncomp_before);
+ if (err)
+ break;
+ }
+ err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+ if (free) {
+ VERIFY3U(0, ==, bpobj_space(&sublist,
+ &used_after, &comp_after, &uncomp_after));
+ bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+ bpo->bpo_phys->bpo_uncomp -=
+ uncomp_before - uncomp_after;
+ }
+
+ bpobj_close(&sublist);
+ if (err)
+ break;
+ if (free) {
+ err = dmu_object_free(bpo->bpo_os,
+ objarray[blkoff], tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_num_subobjs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ (i + 1) * sizeof (uint64_t), -1ULL, tx));
+ }
+
+out:
+ /* If there are no entries, there should be no bytes. */
+ ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+ bpo->bpo_phys->bpo_bytes == 0);
+
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (used == 0) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces
+ * recursion in bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset == doi.doi_data_block_size) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+ subsubobjs, tx));
+ }
+ }
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+
+ bpobj_close(&subbpo);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /* We never need the fill count. */
+ stored_bp.blk_fill = 0;
+
+ /* The bpobj will compress better if we can leave off the checksum */
+ if (!BP_GET_DEDUP(bp))
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ bpo->bpo_phys->bpo_bytes +=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
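(A worked example of the indexing above, with illustrative values: sizeof (blkptr_t) is 128 bytes, so with 128K data blocks bpo_epb is 131072 / 128 == 1024 entries per block.)

/*
 * For entry number 2050 (bpo_num_blkptrs == 2050 before the append):
 *	offset = 2050 * 128          == 262400, i.e. in the third block
 *	blkoff = P2PHASE(2050, 1024) == 2050 % 1024 == 2
 * so the new blkptr is stored at index 2 of that cached block.
 */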
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ /*
+ * As an optimization, if the caller wants the whole txg range, just
+ * return bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
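(A minimal usage sketch; the caller and the variable snaptxg are hypothetical: account only the space born strictly after a given snapshot txg.)

	uint64_t used, comp, uncomp;
	int err;

	err = bpobj_space_range(bpo, snaptxg, UINT64_MAX,
	    &used, &comp, &uncomp);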
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index cf983e234df5..f6b2d99d3285 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -34,12 +33,12 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
/*
* Global data structures and functions for the dbuf cache.
@@ -107,7 +106,7 @@ dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
uint64_t obj = dn->dn_object;
uint64_t hv = DBUF_HASH(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
@@ -138,7 +137,7 @@ static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = db->db_objset;
+ objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
uint64_t blkid = db->db_blkid;
@@ -218,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db)
db->db_evict_func = NULL;
}
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
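(dbuf_is_metadata() above is the first instance of the access pattern this change applies throughout the file; a sketch of the general rule:)

/*
 * Dereference the dnode only between DB_DNODE_ENTER and DB_DNODE_EXIT,
 * so that dnode_move() cannot relocate it while it is in use:
 *
 *	DB_DNODE_ENTER(db);
 *	dn = DB_DNODE(db);
 *	... use dn, but do not cache it past the EXIT ...
 *	DB_DNODE_EXIT(db);
 */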
void
dbuf_evict(dmu_buf_impl_t *db)
{
@@ -282,7 +297,8 @@ dbuf_fini(void)
static void
dbuf_verify(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db)
return;
ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if (dn == NULL) {
ASSERT(db->db_parent == NULL);
ASSERT(db->db_blkptr == NULL);
@@ -297,24 +315,35 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- list_head(&dn->dn_dbufs));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !list_is_empty(&dn->dn_dbufs));
}
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, 0);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
+ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
/*
* We can't assert that db_size matches dn_datablksz because it
* can be momentarily different when another thread is doing
* dnode_set_blksz().
*/
if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
+ dr = db->db_data_pending;
/*
* It should only be modified in syncing context, so
* make sure we only have one copy of the data.
@@ -331,8 +360,9 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
} else {
/* db is pointed to by an indirect block */
int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
@@ -344,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
* have the struct_rwlock. XXX indblksz no longer
 * grows. Safe to do this now?
*/
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
@@ -352,7 +382,8 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && !dn->dn_free_txg) {
/*
* If the blkptr isn't set but they have nonzero data,
@@ -368,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -396,8 +428,35 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
} else {
dbuf_evict_user(db);
db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa;
+
+ mutex_exit(&db->db_mtx);
+ DB_GET_SPA(&spa, db);
+ abuf = arc_loan_buf(spa, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ dbuf_set_data(db, NULL);
+ mutex_exit(&db->db_mtx);
}
+ return (abuf);
}
uint64_t
@@ -436,24 +495,26 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, NULL);
+ dbuf_rele_and_unlock(db, NULL);
}
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
arc_buf_t *pbuf;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -461,7 +522,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -471,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -489,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
+ DB_DNODE_EXIT(db);
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -496,17 +559,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
+ spa = dn->dn_objset->os_spa;
+ DB_DNODE_EXIT(db);
+
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_L2CACHE;
- zb.zb_objset = db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
+ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
@@ -516,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
else
pbuf = db->db_objset->os_phys_buf;
- (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+ (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -530,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
+ dnode_t *dn;
/*
* We don't have to hold the mutex to check db_state because it
@@ -537,46 +602,54 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
ASSERT(!refcount_is_zero(&db->db_holds));
+ if (db->db_state == DB_NOFILL)
+ return (EIO);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
- prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
dbuf_read_impl(db, zio, &flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
if (!havepzio)
err = zio_wait(zio);
} else {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -600,18 +673,21 @@ static void
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
+ DB_GET_SPA(&spa, db);
+ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_set_data(db, NULL);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
@@ -643,18 +719,18 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (dr == NULL ||
(dr->dt.dl.dr_data !=
- ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
return;
/*
* If the last dirty record for this dbuf has not yet synced
 * and it's referencing the dbuf data, either:
- * reset the reference to point to a new copy,
+ * reset the reference to point to a new copy,
 * or (if there are no active holders)
* just null out the current db_data pointer.
*/
ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -662,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -674,22 +752,25 @@ void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
ASSERT(db->db_level == 0);
- if (db->db_blkid == DB_BONUS_BLKID ||
+ if (db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
return;
+ ASSERT(db->db_data_pending != dr);
+
/* free this block */
- if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
- /* XXX can get silent EIO here */
- (void) dsl_free(NULL,
- spa_get_dsl(db->db_dnode->dn_objset->os_spa),
- txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
+ if (!BP_IS_HOLE(bp)) {
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
@@ -719,7 +800,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
uint64_t first_l1 = start >> epbs;
uint64_t last_l1 = end >> epbs;
- if (end > dn->dn_maxblkid) {
+ if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
end = dn->dn_maxblkid;
last_l1 = end >> epbs;
}
@@ -727,7 +808,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level == 1 &&
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
@@ -755,6 +836,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
@@ -782,7 +864,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
* size to reflect that this buffer may
* contain new data when we sync.
*/
- if (db->db_blkid > dn->dn_maxblkid)
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
dn->dn_maxblkid = db->db_blkid;
dbuf_unoverride(dr);
} else {
@@ -825,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
- /* If we don't exist or are in a snapshot, we can't be freed */
+ /*
+ * If we don't exist or are in a snapshot, we can't be freed.
+ * Don't pass the bp to dsl_dataset_block_freeable() since we
+ * are holding the db_mtx lock and might deadlock if we are
+ * prefetching a dedup-ed block.
+ */
if (birth_txg)
return (ds == NULL ||
- dsl_dataset_block_freeable(ds, birth_txg));
+ dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (FALSE);
}
@@ -839,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -858,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -878,14 +970,36 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);
- dnode_willuse_space(db->db_dnode, size-osize, tx);
+ dnode_willuse_space(dn, size-osize, tx);
+ DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os;
+ zbookmark_t zb;
+
+ DB_GET_OBJSET(&os, db);
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ zb.zb_objset = os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_release_bp(db->db_buf, db,
+ db->db_blkptr, os->os_spa, &zb);
}
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
@@ -895,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
DMU_TX_DIRTY_BUF(tx, db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
@@ -920,7 +1036,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* syncing context don't bother holding ahead.
*/
ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL);
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
mutex_enter(&dn->dn_mtx);
/*
@@ -936,6 +1053,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
mutex_exit(&dn->dn_mtx);
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
/*
* If this buffer is already dirty, we're done.
*/
@@ -945,13 +1065,16 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -979,18 +1102,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* we already dirtied it in open context. Hence we must make
* this assertion only if we're not already dirty.
*/
+ os = dn->dn_objset;
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* Update the accounting.
* Note: we delay "free accounting" until after we drop
* the db_mtx. This keeps us from grabbing other locks
- * (and possibly deadlocking) in bp_get_dasize() while
+ * (and possibly deadlocking) in bp_get_dsize() while
* also holding the db_mtx.
*/
dnode_willuse_space(dn, db->db.db_size, tx);
@@ -1006,22 +1130,26 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_level == 0) {
void *data_old = db->db_buf;
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect
- * blocks and private objects are not released until the
- * syncing state (since they are only modified then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
}
- ASSERT(data_old != NULL);
dr->dt.dl.dr_data = data_old;
} else {
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -1039,7 +1167,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* and dbuf_dirty. We win, as though the dbuf_noread() had
* happened after the free.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
@@ -1055,17 +1184,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
} else if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
- bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ bp_get_dsize(os->os_spa, bp) : db->db.db_size;
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
@@ -1074,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* db_blkptr, but since this is just a guess,
* it's OK if we get an odd answer.
*/
+ ddt_prefetch(os->os_spa, bp);
dnode_willuse_space(dn, -willfree, tx);
}
@@ -1097,6 +1229,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
parent = dbuf_hold_level(dn, db->db_level+1,
db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
parent_held = TRUE;
}
if (drop_struct_lock)
@@ -1121,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_level+1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1132,21 +1264,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
}
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
uint64_t txg = tx->tx_txg;
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
-
/*
* If this buffer is not dirty, we're done.
*/
@@ -1158,6 +1290,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (0);
}
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* If this buffer is currently held, we cannot undirty
@@ -1171,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1192,14 +1329,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
+ DB_DNODE_EXIT(db);
if (db->db_level == 0) {
- dbuf_unoverride(dr);
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ }
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -1214,7 +1355,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
arc_buf_t *buf = db->db_buf;
- ASSERT(arc_released(buf));
+ ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
dbuf_set_data(db, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
dbuf_evict(db);
@@ -1234,18 +1375,30 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+
+ dmu_buf_will_fill(db_fake, tx);
+}
+
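(A caller-side sketch; the helper name overwrite_block is hypothetical, and the tx is assumed to already carry the relevant dmu_tx_hold_write(). A consumer that will overwrite a whole block can use this to skip reading the old contents.)

static int
overwrite_block(objset_t *os, uint64_t object, uint64_t offset, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, offset, FTAG, &db, 0);
	if (err != 0)
		return (err);
	dmu_buf_will_not_fill(db, tx);	/* marks the dbuf DB_NOFILL */
	dmu_buf_rele(db, FTAG);
	return (0);
}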
+void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1267,7 +1420,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
@@ -1287,8 +1440,7 @@ void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
ASSERT(buf != NULL);
@@ -1311,9 +1463,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ xuio_stat_wbuf_copied();
return;
}
+ xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
@@ -1349,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
* in this case. For callers from the DMU we will usually see:
* dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
* For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
* Sometimes, though, we will get a mix of these two:
* DMU: dbuf_clear()->arc_buf_evict()
* ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1357,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
void
dbuf_clear(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ dmu_buf_impl_t *dndb;
int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1369,7 +1523,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
@@ -1377,16 +1531,32 @@ dbuf_clear(dmu_buf_impl_t *db)
db->db_state = DB_UNCACHED;
}
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
- if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
+ } else {
+ DB_DNODE_EXIT(db);
}
if (db->db_buf)
@@ -1396,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
/*
- * If this dbuf is referened from an indirect dbuf,
+ * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
if (parent && parent != dndb)
@@ -1412,7 +1582,20 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
*bpp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = &dn->dn_phys->dn_spill;
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
if (dn->dn_phys->dn_nlevels == 0)
nlevels = 1;
@@ -1461,7 +1644,7 @@ static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1475,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkid = blkid;
db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
- db->db_dnode = dn;
+ db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
@@ -1485,16 +1668,20 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_immediate_evict = 0;
db->db_freed_in_flight = 0;
- if (blkid == DB_BONUS_BLKID) {
+ if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- db->db.db_offset = DB_BONUS_BLKID;
+ db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1528,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
@@ -1562,20 +1750,29 @@ dbuf_destroy(dmu_buf_impl_t *db)
{
ASSERT(refcount_is_zero(&db->db_holds));
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (db->db_dnode) {
- dnode_t *dn = db->db_dnode;
+ if (db->db_dnode_handle != NULL) {
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
-
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold
+ * corresponding to the removed dbuf is no longer
+ * discounted in dnode_move(), so the dnode cannot be
+ * moved until after we release the hold.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
}
dbuf_hash_remove(db);
}
@@ -1598,7 +1795,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (dnode_block_freed(dn, blkid))
@@ -1606,37 +1803,34 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
/* dbuf_find() returns with db_mtx held */
if (db = dbuf_find(dn, 0, blkid)) {
- if (refcount_count(&db->db_holds) > 0) {
- /*
- * This dbuf is active. We assume that it is
- * already CACHED, or else about to be either
- * read or filled.
- */
- mutex_exit(&db->db_mtx);
- return;
- }
+ /*
+ * This dbuf is already in the cache. We assume that
+ * it is either CACHED, or else about to be read
+ * or filled.
+ */
mutex_exit(&db->db_mtx);
- db = NULL;
+ return;
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
+ ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
arc_buf_t *pbuf;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
- dn->dn_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = dn->dn_object;
- zb.zb_level = 0;
- zb.zb_blkid = blkid;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, 0, blkid);
if (db)
pbuf = db->db_buf;
else
pbuf = dn->dn_objset->os_phys_buf;
- (void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ (void) dsl_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, priority,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -1655,7 +1849,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
{
dmu_buf_impl_t *db, *parent = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
@@ -1704,7 +1898,7 @@ top:
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
@@ -1713,7 +1907,7 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
@@ -1729,7 +1923,7 @@ top:
if (parent)
dbuf_rele(parent, NULL);
- ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;
@@ -1759,7 +1953,38 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (ENOTSUP);
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ if (blksz > SPA_MAXBLOCKSIZE)
+ blksz = SPA_MAXBLOCKSIZE;
+ else
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dbuf_new_size(db, blksz, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ return (0);
+}
+
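(A worked example of the size clamp above; SPA_MINBLOCKSIZE is 512 and SPA_MAXBLOCKSIZE is 128K in this codebase.)

/*
 *	blksz == 0        -> 512     (raised to the minimum)
 *	blksz == 3000     -> 3072    (P2ROUNDUP to a 512-byte multiple)
 *	blksz == 1 << 20  -> 131072  (capped at the maximum)
 */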
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1770,15 +1995,38 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
ASSERT(holds > 1);
}
+/*
+ * If you call dbuf_rele(), you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
int64_t holds;
- mutex_enter(&db->db_mtx);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
@@ -1794,15 +2042,29 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_evict_user(db);
if (holds == 0) {
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
+
+ /*
+ * If the dnode moves here, we cannot cross this barrier
+ * until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+ DB_DNODE_EXIT(db);
+ /*
+ * The bonus buffer's dnode hold is no longer discounted
+ * in dnode_move(). The dnode cannot move until after
+ * the dnode_rele().
+ */
+ dnode_rele(DB_DNODE(db), db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
* dbuf with any data allocated from the ARC.
*/
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
dbuf_evict(db);
} else if (arc_released(db->db_buf)) {
arc_buf_t *buf = db->db_buf;
@@ -1892,7 +2154,7 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
if (db->db_blkptr)
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
- db->db_blkptr->blk_birth);
+ db->db_blkptr, db->db_blkptr->blk_birth);
return (res);
}
@@ -1906,6 +2168,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (db->db_blkptr != NULL)
return;
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = &dn->dn_phys->dn_spill;
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
/*
* This buffer was allocated at a time when there was
@@ -1941,7 +2208,7 @@ static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
zio_t *zio;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1959,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
}
ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT(db->db_buf != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
db->db_data_pending = dr;
@@ -1982,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2002,23 +2272,34 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
DBUF_VERIFY(db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
/*
* If this is a bonus buffer, simply copy the bonus data into the
* dnode. It will be written out when the dnode is synced (and it
* will be synced, since it must have been dirty for dbuf_sync to
* be called).
*/
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ DB_DNODE_EXIT(db);
+
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2028,6 +2309,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
while (*drp != dr)
drp = &(*drp)->dr_next;
ASSERT(dr->dr_next == NULL);
+ ASSERT(dr->dr_dbuf == db);
*drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
@@ -2036,11 +2318,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
return;
}
+ os = dn->dn_objset;
+
/*
* This function may have dropped the db_mtx lock allowing a dmu_sync
* operation to sneak in. As a result, we need to ensure that we
@@ -2050,7 +2333,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immdiate write,
+ * If this buffer is in the middle of an immediate write,
* wait for the synchronous IO to complete.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2059,43 +2342,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- /*
- * If this dbuf has already been written out via an immediate write,
- * just complete the write by copying over the new block pointer and
- * updating the accounting via the write-completion functions.
- */
- if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- zio_t zio_fake;
-
- zio_fake.io_private = &db;
- zio_fake.io_error = 0;
- zio_fake.io_bp = db->db_blkptr;
- zio_fake.io_bp_orig = *db->db_blkptr;
- zio_fake.io_txg = txg;
- zio_fake.io_flags = 0;
-
- *db->db_blkptr = dr->dt.dl.dr_overridden_by;
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- db->db_data_pending = dr;
- dr->dr_zio = &zio_fake;
- mutex_exit(&db->db_mtx);
-
- ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
- BP_IDENTITY(&zio_fake.io_bp_orig)) ||
- BP_IS_HOLE(zio_fake.io_bp));
-
- if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
- dbuf_write_ready(&zio_fake, db->db_buf, db);
- dbuf_write_done(&zio_fake, db->db_buf, db);
-
- return;
- }
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
*datap == db->db_buf) {
/*
* If this buffer is currently "in use" (i.e., there
@@ -2113,8 +2363,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
-
- ASSERT(*datap != NULL);
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
@@ -2122,10 +2370,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write, it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
+ }
}
void
@@ -2154,111 +2412,53 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
}
}
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
-
- if (!BP_IS_HOLE(db->db_blkptr) &&
- (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(data, db);
- } else {
- ASSERT(arc_released(data));
- /* XXX why do we need to thaw here? */
- arc_buf_thaw(data);
- }
-
- if (parent != dn->dn_dbuf) {
- ASSERT(parent && parent->db_data_pending);
- ASSERT(db->db_level == parent->db_level-1);
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- wp.wp_type = dn->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dncompress = dn->dn_compress;
- wp.wp_oscompress = os->os_compress;
- wp.wp_dnchecksum = dn->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
-
- if (BP_IS_OLDER(db->db_blkptr, txg))
- (void) dsl_dataset_block_kill(
- os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
- dr->dr_zio = arc_write(zio, os->os_spa, &wp,
- DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
- data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-}
-
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
uint64_t fill = 0;
- int old_size, new_size, i;
+ int i;
ASSERT(db->db_blkptr == bp);
- dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, bp);
-
- dnode_diduse_space(dn, new_size - old_size);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- ASSERT3U(bp->blk_fill, ==, 0);
+ ASSERT(bp->blk_fill == 0);
+ DB_DNODE_EXIT(db);
return;
}
- ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
mutex_enter(&db->db_mtx);
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ }
+#endif
+
if (db->db_level == 0) {
mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID)
dn->dn_phys->dn_maxblkid = db->db_blkid;
mutex_exit(&dn->dn_mtx);
@@ -2281,21 +2481,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill += ibp->blk_fill;
}
}
+ DB_DNODE_EXIT(db);
bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
-
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- dsl_dataset_block_born(ds, bp, tx);
- }
}
/* ARGSUSED */
@@ -2303,34 +2493,70 @@ static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t txg = zio->io_txg;
dbuf_dirty_record_t **drp, *dr;
ASSERT3U(zio->io_error, ==, 0);
+ ASSERT(db->db_blkptr == bp);
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+
+ DB_GET_OBJSET(&os, db);
+ ds = os->os_dsl_dataset;
+ tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
drp = &db->db_last_dirty;
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
+ }
+#endif
+
if (db->db_level == 0) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ else if (!arc_released(db->db_buf))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
} else {
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2342,6 +2568,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
+ DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
@@ -2351,9 +2578,134 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
mutex_exit(&db->db_mtx);
- dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+ dbuf_write_done(zio, NULL, db);
+}
- dbuf_rele(db, (void *)(uintptr_t)txg);
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ int wp_flag = 0;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ ASSERT(db->db_state != DB_NOFILL);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ dbuf_write_override_ready, dbuf_write_override_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, NULL, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+ dr->dr_zio = arc_write(zio, os->os_spa, txg,
+ db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
}
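dbuf_write() above selects one of three I/O paths: a level-0 buffer already marked DR_OVERRIDDEN (dmu_sync() completed first) issues a zio_write() whose block pointer is then pinned via zio_write_override(); DB_NOFILL buffers issue a data-less write (ZIO_FLAG_NODATA, checksum off); everything else goes through arc_write() so the ARC buffer itself is what reaches disk, with dbuf_write_ready()/dbuf_write_done() as the completion callbacks.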
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
new file mode 100644
index 000000000000..0edf62e89c05
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -0,0 +1,1152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
+TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch);
+SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch,
+ 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
+
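The tunable above is registered both as a boot-time loader tunable and as a read/write sysctl, so prefetch behaviour can be toggled without rebuilding the module. Standard FreeBSD usage (not part of this change):

    # at runtime:
    sysctl vfs.zfs.dedup.prefetch=0
    # at boot, via /boot/loader.conf:
    vfs.zfs.dedup.prefetch=0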
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+
+ /*
+ * Seed the cached statistics.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class]));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) sprintf(name, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
+
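ddt_stat_add() treats both ddt_stat_t structures as flat arrays of uint64_t and relies on the two's-complement identity (x ^ neg) - neg, which yields x when neg == 0 and -x when neg == -1ULL (since ~x + 1 == -x), so one routine serves both addition and subtraction. A standalone sketch of the identity:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t x = 3, acc = 10;

            assert(acc + ((x ^ (uint64_t)0) - (uint64_t)0) == 13);  /* neg == 0: add */
            assert(acc + ((x ^ ~0ULL) - ~0ULL) == 7);               /* neg == ~0: subtract */
            return (0);
    }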
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
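highbit() returns the one-based index of the highest set bit, so bucket = highbit(n) - 1 is floor(log2(n)): histogram row h accumulates entries whose total reference count lies in [2^h, 2^(h+1)). For example, a refcnt of 1 lands in row 0, refcnts 2 and 3 in row 1, and a refcnt of 1000 in row 9; 64 rows cover every bit position of a 64-bit count, matching the loops over ddh_stat[0..63] below.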
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
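The ratio is expressed as a percentage, with 100 as the neutral value for an empty table. For example, if referenced data logically totals dds_ref_dsize = 500 GB while only dds_dsize = 100 GB is allocated on disk, the function returns (500 * 100) / 100 = 500, which userland reports as a 5.00x dedup ratio.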
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+}
+
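A worked example: with the pool's dedupditto property set to 100, an entry wants one copy as soon as it is referenced at all, a second at 100 total references, and a third at 100 * 100 = 10,000; the default of 0 is clamped to UINT32_MAX above, effectively disabling extra ditto copies. Since slot p of dde_phys stores blocks with p copies, total_copies already reflects what is on disk, and the result is only the number of additional copies to write, never negative.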
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+}
+
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+ return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ byteswap_uint64_array(dst, d_len);
+}
+
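Every stored phys array is framed by a single leading byte: the low bits name the compression function and DDT_COMPRESS_BYTEORDER_MASK records the writer's endianness, which is how ddt_decompress() decides whether to byteswap. If ZLE achieves no reduction (c_len == s_len), the raw bytes are stored under ZIO_COMPRESS_OFF instead. A sketch of the resulting layout:

    byte 0                              bytes 1 .. c_len
    +-------------------------------+   +-------------------------------+
    | byteorder bit | cpfunc        |   | ZLE-compressed or raw copy of |
    | (writer)      | (ZLE or OFF)  |   | the dde_phys array            |
    +-------------------------------+   +-------------------------------+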
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+ return (spa->spa_ddt[c]);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_data != NULL)
+ zio_buf_free(dde->dde_repair_data,
+ DDK_GET_PSIZE(&dde->dde_key));
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT)
+ break;
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ASSERT(error == 0 || error == ENOENT);
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
+
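ddt_lookup() allows exactly one thread to fault an entry in from disk: the loader sets dde_loading, drops ddt_lock while probing every on-disk type/class pair until something other than ENOENT returns, then republishes the result under the lock and wakes waiters, who block on dde_cv rather than issuing duplicate I/O. The generic shape of the pattern, as a minimal sketch in the kernel's own synchronization style:

    mutex_enter(lock);
    while (e->loading)
            cv_wait(&e->cv, lock);
    if (!e->loaded) {
            e->loading = B_TRUE;
            mutex_exit(lock);
            slow_load(e);               /* I/O with the lock dropped */
            mutex_enter(lock);
            e->loaded = B_TRUE;
            e->loading = B_FALSE;
            cv_broadcast(&e->cv);
    }
    mutex_exit(lock);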
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+ for (int i = 0; i < DDT_KEY_WORDS; i++) {
+ if (u1[i] < u2[i])
+ return (-1);
+ if (u1[i] > u2[i])
+ return (1);
+ }
+
+ return (0);
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object, tx) == 0);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ count += ddt_object_count(ddt, type, class);
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ zio_t *rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ dmu_tx_commit(tx);
+}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (ENOENT);
+}
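ddt_walk() drives a persistent ddt_bookmark_t: the per-object cursor is exhausted first, then the checksum, type, and class indices step in turn, so a caller can enumerate the pool-wide DDT incrementally and resume after dropping locks. A usage sketch, where process() stands in for a hypothetical consumer:

    ddt_bookmark_t ddb = { 0 };
    ddt_entry_t dde;
    int error;

    while ((error = ddt_walk(spa, &ddb, &dde)) == 0)
            process(&dde);
    ASSERT(error == ENOENT);    /* ENOENT means the walk completed */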
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
new file mode 100644
index 000000000000..6812aa34cfb7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
@@ -0,0 +1,156 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? ENOTSUP : 0);
+}
+
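The prehash flag comes from zio_checksum_table[].ci_dedup in ddt_object_create(): when the dedup checksum is itself a strong hash, ZAP_FLAG_PRE_HASHED_KEY presumably lets the ZAP take its hash bits straight from the 320-bit key rather than hashing it again. A zero object number from zap_create_flags() is mapped to ENOTSUP, presumably because not every pool version supports these fat-ZAP flags.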
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t one, csize;
+ int error;
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ return (error);
+
+ ASSERT(one == 1);
+ ASSERT(csize <= sizeof (cbuf));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ return (error);
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+ return (0);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+ uint64_t count = 0;
+
+ VERIFY(zap_count(os, object, &count) == 0);
+
+ return (count);
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 26b4e5f5a855..56e284a6d610 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -40,7 +39,10 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/sa.h>
+#ifdef _KERNEL
#include <sys/zfs_znode.h>
+#endif
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -48,8 +50,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint64_array, TRUE, "object array" },
{ byteswap_uint8_array, TRUE, "packed nvlist" },
{ byteswap_uint64_array, TRUE, "packed nvlist size" },
- { byteswap_uint64_array, TRUE, "bplist" },
- { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "bpobj" },
+ { byteswap_uint64_array, TRUE, "bpobj header" },
{ byteswap_uint64_array, TRUE, "SPA space map header" },
{ byteswap_uint64_array, TRUE, "SPA space map" },
{ byteswap_uint64_array, TRUE, "ZIL intent log" },
@@ -81,21 +83,38 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "FUID table" },
{ byteswap_uint64_array, TRUE, "FUID table size" },
{ zap_byteswap, TRUE, "DSL dataset next clones"},
- { zap_byteswap, TRUE, "scrub work queue" },
+ { zap_byteswap, TRUE, "scan work queue" },
{ zap_byteswap, TRUE, "ZFS user/group used" },
{ zap_byteswap, TRUE, "ZFS user/group quota" },
+ { zap_byteswap, TRUE, "snapshot refcount tags"},
+ { zap_byteswap, TRUE, "DDT ZAP algorithm" },
+ { zap_byteswap, TRUE, "DDT statistics" },
+ { byteswap_uint8_array, TRUE, "System attributes" },
+ { zap_byteswap, TRUE, "SA master node" },
+ { zap_byteswap, TRUE, "SA attr registration" },
+ { zap_byteswap, TRUE, "SA attr layouts" },
+ { zap_byteswap, TRUE, "scan translations" },
+ { byteswap_uint8_array, FALSE, "deduplicated block" },
+ { zap_byteswap, TRUE, "DSL deadlist map" },
+ { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" },
+ { zap_byteswap, TRUE, "DSL dir clones" },
+ { byteswap_uint64_array, TRUE, "bpobj subobj" },
};
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
+ void *tag, dmu_buf_t **dbp, int flags)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
blkid = dbuf_whichblock(dn, offset);
@@ -105,7 +124,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
if (db == NULL) {
err = EIO;
} else {
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ err = dbuf_read(db, NULL, db_flags);
if (err) {
dbuf_rele(db, tag);
db = NULL;
@@ -113,7 +132,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
}
dnode_rele(dn, FTAG);
- *dbp = &db->db;
+ *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -124,16 +143,79 @@ dmu_bonus_max(void)
}
int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
- if (newsize < 0 || newsize > db->db_size)
- return (EINVAL);
- dnode_setbonuslen(dn, newsize, tx);
- return (0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = EINVAL;
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
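This rewrite shows the pattern applied throughout the change: raw db->db_dnode dereferences become a DB_DNODE_ENTER()/DB_DNODE_EXIT() pair, which pins the dnode handle so a concurrent dnode_move() cannot relocate the dnode mid-use. The pointer from DB_DNODE() is only valid between the two calls and must not be cached past the EXIT:

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    /* ... use dn; keep the window short ... */
    DB_DNODE_EXIT(db);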
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (type > DMU_OT_NUMTYPES) {
+ error = EINVAL;
+ } else if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error != 0)
+		return (error);
+	dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
}
/*
@@ -146,7 +228,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dmu_buf_impl_t *db;
int error;
- error = dnode_hold(os->os, object, FTAG, &dn);
+ error = dnode_hold(os, object, FTAG, &dn);
if (error)
return (error);
@@ -158,21 +240,105 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
/* as long as the bonus buf is held, the dnode will be held */
- if (refcount_add(&db->db_holds, tag) == 1)
+ if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
*dbp = &db->db;
return (0);
}
/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ ASSERT(db != NULL);
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else
+ dbuf_rele(db, tag);
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = EINVAL;
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = ENOENT;
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
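A caller that already holds an object's bonus buffer can reach the spill block without another dnode lookup. A hedged sketch, assuming bonus_db is a held bonus dmu_buf_t:

    dmu_buf_t *spill_db;
    int err;

    err = dmu_spill_hold_existing(bonus_db, FTAG, &spill_db);
    if (err == 0) {
            /* ... read overflow system attributes from spill_db->db_data ... */
            dmu_buf_rele(spill_db, FTAG);
    }
    /* err == ENOENT simply means the object has no spill block yet */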
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
* to take a held dnode rather than <os, object> -- the lookup is wasteful,
* and can induce severe lock contention when writing to several files
@@ -278,7 +444,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -291,14 +457,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
}
int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
int err;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
return (err);
}
@@ -331,7 +501,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
@@ -348,7 +518,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return;
@@ -480,7 +650,7 @@ dmu_free_long_range(objset_t *os, uint64_t object,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
@@ -495,7 +665,7 @@ dmu_free_object(objset_t *os, uint64_t object)
dmu_tx_t *tx;
int err;
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err != 0)
return (err);
@@ -523,7 +693,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
{
dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
+ int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
ASSERT(offset < UINT64_MAX);
@@ -541,7 +711,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_t **dbp;
int numbufs, err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -634,12 +804,157 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+ dmu_xuio_t *priv;
+ uio_t *uio = &xuio->xu_uio;
+
+ uio->uio_iovcnt = nblk;
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv->cnt = nblk;
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->iovp = uio->uio_iov;
+ XUIO_XUZC_PRIV(xuio) = priv;
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+ return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int nblk = priv->cnt;
+
+ kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+ kmem_free(priv, sizeof (dmu_xuio_t));
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+ struct iovec *iov;
+ uio_t *uio = &xuio->xu_uio;
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int i = priv->next++;
+
+ ASSERT(i < priv->cnt);
+ ASSERT(off + n <= arc_buf_size(abuf));
+ iov = uio->uio_iov + i;
+ iov->iov_base = (char *)abuf->b_data + off;
+ iov->iov_len = n;
+ priv->bufs[i] = abuf;
+ return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ priv->bufs[i] = NULL;
+}
+
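The dmu_xuio_* helpers stage loaned ARC buffers behind an ordinary uio so copy-avoiding consumers can drain them as iovecs. A minimal lifecycle sketch; the xuio itself and the buffers (e.g. from dmu_request_arcbuf()) are assumed to exist:

    dmu_xuio_init(xuio, 2);                 /* room for two iovecs */
    dmu_xuio_add(xuio, abuf0, 0, arc_buf_size(abuf0));
    dmu_xuio_add(xuio, abuf1, 0, arc_buf_size(abuf1));
    /* ... consumer drains xuio->xu_uio ... */
    dmu_xuio_fini(xuio);                    /* frees iovec/buf arrays, adjusts kstats */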
+static void
+xuio_stat_init(void)
+{
+ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (xuio_ksp != NULL) {
+ xuio_ksp->ks_data = &xuio_stats;
+ kstat_install(xuio_ksp);
+ }
+}
+
+static void
+xuio_stat_fini(void)
+{
+ if (xuio_ksp != NULL) {
+ kstat_delete(xuio_ksp);
+ xuio_ksp = NULL;
+ }
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
+ xuio_t *xuio = NULL;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -650,6 +965,11 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
if (err)
return (err);
+#ifdef UIO_XUIO
+ if (uio->uio_extflg == UIO_XUIO)
+ xuio = (xuio_t *)uio;
+#endif
+
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
@@ -660,8 +980,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
+ if (xuio) {
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ arc_buf_t *dbuf_abuf = dbi->db_buf;
+ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+ if (!err) {
+ uio->uio_resid -= tocpy;
+ uio->uio_loffset += tocpy;
+ }
+
+ if (abuf == dbuf_abuf)
+ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+ else
+ XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+ } else {
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ }
if (err)
break;
@@ -672,19 +1008,16 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
return (err);
}
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
+static int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
- int numbufs, i;
+ int numbufs;
int err = 0;
+ int i;
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
- FALSE, FTAG, &numbufs, &dbp);
+ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
@@ -722,11 +1055,52 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
size -= tocpy;
}
+
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
-#ifndef __FreeBSD__
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+#ifdef sun
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
page_t *pp, dmu_tx_t *tx)
@@ -781,8 +1155,8 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
-#endif /* !__FreeBSD__ */
-#endif /* _KERNEL */
+#endif /* sun */
+#endif
/*
* Allocate a loaned anonymous arc buffer.
@@ -790,9 +1164,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ spa_t *spa;
- return (arc_loan_buf(dn->dn_objset->os_spa, size));
+ DB_GET_SPA(&spa, db);
+ return (arc_loan_buf(spa, size));
}
/*
@@ -814,78 +1190,147 @@ void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+ dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_size(buf);
uint64_t blkid;
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(dbuf);
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
+ objset_t *os;
+ uint64_t object;
+
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
+ os = dn->dn_objset;
+ object = dn->dn_object;
+ DB_DNODE_EXIT(dbuf);
+
dbuf_rele(db, FTAG);
- ASSERT(dn->dn_objset->os.os == dn->dn_objset);
- dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
- buf->b_data, tx);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
typedef struct {
- dbuf_dirty_record_t *dr;
- dmu_sync_cb_t *done;
- void *arg;
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;
/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
blkptr_t *bp = zio->io_bp;
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- if (!BP_IS_HOLE(bp)) {
- ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
- ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
- } else {
- /*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to replay
- */
- BP_SET_LSIZE(bp, db->db.db_size);
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else {
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
}
}
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dmu_sync_cb_t *done = in->done;
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
- BP_ZERO(&dr->dt.dl.dr_overridden_by);
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- if (done)
- done(&(db->db), in->arg);
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
- kmem_free(in, sizeof (dmu_sync_arg_t));
+ if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+		return (EIO);	/* Make zl_get_data do txg_wait_synced() */
+ }
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
}
/*
@@ -904,157 +1349,112 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
- * EINPROGRESS: the IO has been initiated.
- * The caller should log this blkptr in the callback.
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
*
- * 0: completed. Sets *bp to the blkptr just written.
- * The caller should log this blkptr immediately.
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
*/
int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
- blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- objset_impl_t *os = db->db_objset;
- dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
- tx_state_t *tx = &dp->dp_tx;
+ blkptr_t *bp = zgd->zgd_bp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *in;
+ dmu_sync_arg_t *dsa;
zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
- int err;
+ zio_prop_t zp;
+ dnode_t *dn;
+ ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
- dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
- txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
/*
- * XXX - would be nice if we could do this without suspending...
+ * If we're frozen (running ziltest), we always need to generate a bp.
*/
- txg_suspend(dp);
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
/*
- * If this txg already synced, there's nothing to do.
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
*/
- if (txg <= tx->tx_synced_txg) {
- txg_resume(dp);
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
/*
- * If we're running ziltest, we need the blkptr regardless.
+ * This txg has already synced. There's nothing to do.
*/
- if (txg > spa_freeze_txg(dp->dp_spa)) {
- /* if db_blkptr == NULL, this was an empty write */
- if (db->db_blkptr)
- *bp = *db->db_blkptr; /* structure assignment */
- return (0);
- }
+ mutex_exit(&db->db_mtx);
return (EEXIST);
}
- mutex_enter(&db->db_mtx);
-
- if (txg == tx->tx_syncing_txg) {
- while (db->db_data_pending) {
- /*
- * IO is in-progress. Wait for it to finish.
- * XXX - would be nice to be able to somehow "attach"
- * this zio to the parent zio passed in.
- */
- cv_wait(&db->db_changed, &db->db_mtx);
- if (!db->db_data_pending &&
- db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
- /*
- * IO was compressed away
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- ASSERT(db->db_data_pending ||
- (db->db_blkptr && db->db_blkptr->blk_birth == txg));
- }
-
- if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
- /*
- * IO is already completed.
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
}
dr = db->db_last_dirty;
- while (dr && dr->dr_txg > txg)
+ while (dr && dr->dr_txg != txg)
dr = dr->dr_next;
- if (dr == NULL || dr->dr_txg < txg) {
+
+ if (dr == NULL) {
/*
- * This dbuf isn't dirty, must have been free_range'd.
+ * There's no dr for this dbuf, so it must have been freed.
* There's no need to log writes to freed blocks, so we're done.
*/
mutex_exit(&db->db_mtx);
- txg_resume(dp);
return (ENOENT);
}
ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
- * We have already issued a sync write for this buffer.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (EALREADY);
- } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * This buffer has already been synced. It could not
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
* have been dirtied since, or we would have cleared the state.
*/
- *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
+ return (EALREADY);
}
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- in->dr = dr;
- in->done = done;
- in->arg = arg;
mutex_exit(&db->db_mtx);
- txg_resume(dp);
- zb.zb_objset = os->os_dsl_dataset->ds_object;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
- wp.wp_type = db->db_dnode->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dnchecksum = db->db_dnode->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_dncompress = db->db_dnode->dn_compress;
- wp.wp_oscompress = os->os_compress;
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dmu_sync_ready, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
- ASSERT(BP_IS_HOLE(bp));
-
- zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
- txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-
- if (pio) {
- zio_nowait(zio);
- err = EINPROGRESS;
- } else {
- err = zio_wait(zio);
- ASSERT(err == 0);
- }
- return (err);
+ return (0);
}
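
[Editor's note: illustration only, not part of the change. The return-value
contract documented above is what the ZIL get-data path (zfs_get_data()
in-tree) keys off; below is a minimal userland sketch of that dispatch,
with handle_dmu_sync_result() as a hypothetical name.]

#include <errno.h>
#include <stdio.h>

static int
handle_dmu_sync_result(int err)
{
	switch (err) {
	case 0:		/* write in flight: log the bp in the done callback */
		(void) printf("in flight; wait for done callback\n");
		return (0);
	case EEXIST:	/* txg already synced: nothing to log */
	case ENOENT:	/* block was freed: nothing to log */
		return (0);
	case EALREADY:	/* sync write already issued for this buffer */
		(void) printf("already in dmu_sync; track progress\n");
		return (0);
	case EIO:	/* could not do the I/O: fall back to the txg wait */
		(void) printf("falling back to txg_wait_synced()\n");
		return (EIO);
	default:
		return (err);
	}
}

int
main(void)
{
	return (handle_dmu_sync_result(EIO) == EIO ? 0 : 1);
}
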
int
@@ -1064,7 +1464,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dnode_set_blksz(dn, size, ibs, tx);
@@ -1079,7 +1479,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
+ (void) dnode_hold(os, object, FTAG, &dn);
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
@@ -1093,20 +1493,103 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
+ (void) dnode_hold(os, object, FTAG, &dn);
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
}
+int zfs_mdcomp_disable = 0;
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ int copies = os->os_copies;
+
+ /*
+ * Determine checksum setting.
+ */
+ if (ismd) {
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (zio_checksum_table[checksum].ci_correctable < 1 ||
+ zio_checksum_table[checksum].ci_eck)
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum, checksum);
+ }
+
+ /*
+ * Determine compression setting.
+ */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+ } else {
+ compress = zio_compress_select(dn->dn_compress, compress);
+ }
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(), we won't
+ * actually dedup now because that's all done in syncing context;
+ * but we do want to use the dedup checksum. If the checksum is not
+ * strong enough to ensure unique signatures, force dedup_verify.
+ */
+ dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
+ if (dedup) {
+ checksum = dedup_checksum;
+ if (!zio_checksum_table[checksum].ci_dedup)
+ dedup_verify = 1;
+ }
+
+ if (wp & WP_DMU_SYNC)
+ dedup = 0;
+
+ if (wp & WP_NOFILL) {
+ ASSERT(!ismd && level == 0);
+ checksum = ZIO_CHECKSUM_OFF;
+ compress = ZIO_COMPRESS_OFF;
+ dedup = B_FALSE;
+ }
+
+ zp->zp_checksum = checksum;
+ zp->zp_compress = compress;
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+}
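
[Editor's note: a standalone model, not ZFS code, of the dedup branch above.
When a dedup checksum is configured and the block is not metadata, that
checksum replaces the dataset's, and dedup_verify is forced whenever the
checksum table says the function is too weak to trust for dedup on its own
(ci_dedup == 0). The one-field table below is a stand-in for
zio_checksum_table[].]

#include <assert.h>
#include <stdbool.h>

#define	CKSUM_OFF	0		/* stand-in for ZIO_CHECKSUM_OFF */

struct cksum_info { bool ci_dedup; };	/* stand-in for zio_checksum_info_t */

static void
pick_dedup(bool ismd, int dedup_cksum, const struct cksum_info *tbl,
    int *cksum, bool *dedup, bool *verify)
{
	*dedup = (!ismd && dedup_cksum != CKSUM_OFF);
	if (*dedup) {
		*cksum = dedup_cksum;
		if (!tbl[*cksum].ci_dedup)
			*verify = true;	/* weak checksum: verify on match */
	}
}

int
main(void)
{
	/* index 1 models a fletcher-like checksum, index 2 a sha256-like one */
	struct cksum_info tbl[] = { { false }, { false }, { true } };
	int cksum = CKSUM_OFF;
	bool dedup = false, verify = false;

	pick_dedup(false, 1, tbl, &cksum, &dedup, &verify);
	assert(dedup && verify);	/* weak checksum forces verification */

	dedup = verify = false;
	pick_dedup(false, 2, tbl, &cksum, &dedup, &verify);
	assert(dedup && !verify);	/* strong checksum needs no verify */
	return (0);
}
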
+
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
int i, err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
/*
@@ -1120,7 +1603,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
if (i != TXG_SIZE) {
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
}
@@ -1134,21 +1617,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
+ dnode_phys_t *dnp;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
+ dnp = dn->dn_phys;
+
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
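
[Editor's note: worked example. The renamed doi_physical_blocks_512 field
rounds the byte count to the nearest 512-byte unit, so DN_USED_BYTES of
10,000 gives (10000 + 256) >> 9 = 20 blocks, where plain truncation
(10000 >> 9) would report 19.]
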
@@ -1162,7 +1651,7 @@ int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
+ int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -1178,9 +1667,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
}
/*
@@ -1188,14 +1681,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
* This is specifically optimized for zfs_getattr().
*/
void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
/* add 1 for dnode space */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + 1;
+ DB_DNODE_EXIT(db);
}
void
@@ -1246,8 +1745,12 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
- dbuf_init();
+ zfs_dbgmsg_init();
+ sa_cache_init();
+ xuio_stat_init();
+ dmu_objset_init();
dnode_init();
+ dbuf_init();
zfetch_init();
arc_init();
l2arc_init();
@@ -1256,9 +1759,13 @@ dmu_init(void)
void
dmu_fini(void)
{
+ l2arc_fini();
arc_fini();
zfetch_fini();
- dnode_fini();
dbuf_fini();
- l2arc_fini();
+ dnode_fini();
+ dmu_objset_fini();
+ xuio_stat_fini();
+ sa_cache_fini();
+ zfs_dbgmsg_fini();
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
new file mode 100644
index 000000000000..c72a28ba0cde
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+ struct file *da_fp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+ kthread_t *da_td;
+};
+
+static int
+write_bytes(struct diffarg *da)
+{
+ struct uio auio;
+ struct iovec aiov;
+
+ aiov.iov_base = (caddr_t)&da->da_ddr;
+ aiov.iov_len = sizeof (da->da_ddr);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = da->da_td;
+#ifdef _KERNEL
+ if (da->da_fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ return (EOPNOTSUPP);
+#endif
+}
+
+static int
+write_record(struct diffarg *da)
+{
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ da->da_err = write_bytes(da);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
+
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
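
[Editor's note: illustration only. report_free_dnode_range() and
report_dnode() above run-length-encode the object stream: the pending record
is extended while the type matches and the object number continues the run
(first == last + 1), and is flushed through write_record() otherwise. Below
is a userland model of that invariant, with the DDR_* values assumed for the
sketch.]

#include <assert.h>
#include <stdint.h>

enum ddr_type { DDR_NONE, DDR_INUSE, DDR_FREE };

struct ddr { enum ddr_type type; uint64_t first, last; int flushed; };

static void
report(struct ddr *r, enum ddr_type t, uint64_t obj)
{
	if (r->type == t && obj == r->last + 1) {
		r->last = obj;		/* extend the current run */
		return;
	}
	if (r->type != DDR_NONE)
		r->flushed++;		/* stands in for write_record() */
	r->type = t;
	r->first = r->last = obj;	/* start a new run */
}

int
main(void)
{
	struct ddr r = { DDR_NONE, 0, 0, 0 };

	report(&r, DDR_INUSE, 4);
	report(&r, DDR_INUSE, 5);	/* coalesced: run is now [4,5] */
	report(&r, DDR_FREE, 6);	/* type change flushes [4,5] */
	assert(r.flushed == 1 && r.first == 6 && r.last == 6);
	return (0);
}
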
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
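
[Editor's note: worked example under typical constants (SPA_MINBLOCKSHIFT = 9,
SPA_BLKPTRSHIFT = 7, and for the meta-dnode dn_datablkszsec = 32,
dn_indblkshift = 14). DBP_SPAN gives 32 << 9 = 16K at level 0, and each
indirect level multiplies the span by 2^(14-7) = 128, so one level-1 pointer
covers 2 MB, i.e. 4096 dnodes at 512 bytes each, matching the
span >> DNODE_SHIFT conversion in diff_cb() below.]
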
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct diffarg *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (EINTR);
+
+ if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (bp == NULL) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ uint32_t aflags = ARC_WAIT;
+ int blksz = BP_GET_LSIZE(bp);
+ int i;
+
+ if (dsl_read(NULL, spa, bp, pbuf,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ blk = abuf->b_data;
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp, offset_t *offp)
+{
+ struct diffarg da;
+ dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
+ dsl_dataset_t *findds;
+ dsl_dataset_t *relds;
+ int err = 0;
+
+ /* make certain we are looking at snapshots */
+ if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds))
+ return (EINVAL);
+
+ /* fromsnap must be earlier and from the same lineage as tosnap */
+ if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)
+ return (EXDEV);
+
+ relds = NULL;
+ findds = ds;
+
+ while (fromds->ds_dir != findds->ds_dir) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (!dsl_dir_is_clone(findds->ds_dir)) {
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+ return (EXDEV);
+ }
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp,
+ findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds);
+ rw_exit(&dp->dp_config_rwlock);
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ if (err)
+ return (EXDEV);
+
+ relds = findds;
+ }
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ da.da_fp = fp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+ da.da_td = curthread;
+
+ err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+ if (err) {
+ da.da_err = err;
+ } else {
+ /* write_record() sets the da.da_err that we return, as a side effect */
+ (void) write_record(&da);
+ }
+
+ return (da.da_err);
+}
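
[Editor's note: illustration only. dmu_diff() emits fixed-size records into
the target file; a consumer reads them back one record at a time. The layout
and DDR_* values below are assumptions made for the sketch, covering only the
three fields this file populates; the real dmu_diff_record_t lives in
zfs_ioctl.h.]

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

enum { DDR_NONE, DDR_INUSE, DDR_FREE };	/* values assumed */

struct diff_record {			/* assumed layout */
	uint64_t ddr_type;
	uint64_t ddr_first;
	uint64_t ddr_last;
};

static void
consume_diff_stream(int fd)
{
	struct diff_record dr;

	while (read(fd, &dr, sizeof (dr)) == sizeof (dr)) {
		(void) printf("%s objects %ju-%ju\n",
		    dr.ddr_type == DDR_FREE ? "freed" : "in use",
		    (uintmax_t)dr.ddr_first, (uintmax_t)dr.ddr_last);
	}
}

int
main(void)
{
	consume_diff_stream(STDIN_FILENO);
	return (0);
}
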
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 1f91fc1ad36f..8dff46048902 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -32,16 +31,15 @@ uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- objset_impl_t *osi = os->os;
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
int restarted = B_FALSE;
- mutex_enter(&osi->os_obj_lock);
+ mutex_enter(&os->os_obj_lock);
for (;;) {
- object = osi->os_obj_next;
+ object = os->os_obj_next;
/*
* Each time we polish off an L2 bp worth of dnodes
* (2^13 objects), move to another L2 bp that's still
@@ -51,14 +49,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
*/
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(osi->os_meta_dnode,
+ int error = dnode_next_offset(DMU_META_DNODE(os),
DNODE_FIND_HOLE,
&offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
if (error == 0)
object = offset >> DNODE_SHIFT;
}
- osi->os_obj_next = ++object;
+ os->os_obj_next = ++object;
/*
* XXX We should check for an i/o error here and return
@@ -66,19 +64,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
FTAG, &dn);
if (dn)
break;
if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- osi->os_obj_next = object - 1;
+ os->os_obj_next = object - 1;
}
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
- mutex_exit(&osi->os_obj_lock);
+ mutex_exit(&os->os_obj_lock);
dmu_tx_add_new_object(tx, os, object);
return (object);
@@ -94,7 +92,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
if (err)
return (err);
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
@@ -116,7 +114,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (object == DMU_META_DNODE_OBJECT)
return (EBADF);
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err)
return (err);
@@ -128,7 +126,11 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
return (0);
}
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (bonustype == DMU_OT_SA) {
+ nblkptr = 1;
+ } else {
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ }
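
[Editor's note: with the on-disk constants of this era (DN_MAX_BONUSLEN = 320,
SPA_BLKPTRSHIFT = 7, i.e. 128-byte block pointers), the else branch yields
1 + (320 >> 7) = 3 block pointers for an empty bonus buffer and exactly 1 for
a full 320-byte one; the new DMU_OT_SA case pins system-attribute dnodes at a
single block pointer so the rest of the dnode stays free for the (possibly
spill-backed) bonus area.]
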
/*
* If we are losing blkptrs or changing the block size this must
@@ -166,7 +168,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err)
return (err);
@@ -185,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
int error;
- error = dnode_next_offset(os->os->os_meta_dnode,
+ error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 2678b839fda7..09d13db717ee 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
@@ -36,22 +37,41 @@
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
spa_t *
dmu_objset_spa(objset_t *os)
{
- return (os->os->os_spa);
+ return (os->os_spa);
}
zilog_t *
dmu_objset_zil(objset_t *os)
{
- return (os->os->os_zil);
+ return (os->os_zil);
}
dsl_pool_t *
@@ -59,82 +79,112 @@ dmu_objset_pool(objset_t *os)
{
dsl_dataset_t *ds;
- if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
return (ds->ds_dir->dd_pool);
else
- return (spa_get_dsl(os->os->os_spa));
+ return (spa_get_dsl(os->os_spa));
}
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
- return (os->os->os_dsl_dataset);
+ return (os->os_dsl_dataset);
}
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
- return (os->os->os_phys->os_type);
+ return (os->os_phys->os_type);
}
void
dmu_objset_name(objset_t *os, char *buf)
{
- dsl_dataset_name(os->os->os_dsl_dataset, buf);
+ dsl_dataset_name(os->os_dsl_dataset, buf);
}
uint64_t
dmu_objset_id(objset_t *os)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+uint64_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance should have been done by now.
*/
ASSERT(newval != ZIO_CHECKSUM_INHERIT);
- osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}
static void
compression_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval != ZIO_COMPRESS_INHERIT);
- osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+ os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}
static void
copies_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval > 0);
- ASSERT(newval <= spa_max_replication(osi->os_spa));
+ ASSERT(newval <= spa_max_replication(os->os_spa));
- osi->os_copies = newval;
+ os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
@@ -142,13 +192,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval)
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
- osi->os_primary_cache = newval;
+ os->os_primary_cache = newval;
}
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
@@ -156,7 +206,35 @@ secondary_cache_changed_cb(void *arg, uint64_t newval)
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
- osi->os_secondary_cache = newval;
+ os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+ newval == ZFS_SYNC_DISABLED);
+
+ os->os_sync = newval;
+ if (os->os_zil)
+ zil_set_sync(os->os_zil, newval);
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+ newval == ZFS_LOGBIAS_THROUGHPUT);
+ os->os_logbias = newval;
+ if (os->os_zil)
+ zil_set_logbias(os->os_zil, newval);
}
void
@@ -177,39 +255,37 @@ dmu_objset_byteswap(void *buf, size_t size)
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- objset_impl_t **osip)
+ objset_t **osp)
{
- objset_impl_t *osi;
+ objset_t *os;
int i, err;
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
- osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
- osi->os.os = osi;
- osi->os_dsl_dataset = ds;
- osi->os_spa = spa;
- osi->os_rootbp = bp;
- if (!BP_IS_HOLE(osi->os_rootbp)) {
+ os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+ os->os_dsl_dataset = ds;
+ os->os_spa = spa;
+ os->os_rootbp = bp;
+ if (!BP_IS_HOLE(os->os_rootbp)) {
uint32_t aflags = ARC_WAIT;
zbookmark_t zb;
- zb.zb_objset = ds ? ds->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
- if (DMU_OS_IS_L2CACHEABLE(osi))
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_L2CACHE;
- dprintf_bp(osi->os_rootbp, "reading %s", "");
+ dprintf_bp(os->os_rootbp, "reading %s", "");
/*
- * NB: when bprewrite scrub can change the bp,
+ * XXX when bprewrite scrub can change the bp,
* and this is called from dmu_objset_open_ds_os, the bp
* could change, and we'll need a lock.
*/
- err = arc_read_nolock(NULL, spa, osi->os_rootbp,
- arc_getbuf_func, &osi->os_phys_buf,
+ err = dsl_read_nolock(NULL, spa, os->os_rootbp,
+ arc_getbuf_func, &os->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
- kmem_free(osi, sizeof (objset_impl_t));
+ kmem_free(os, sizeof (objset_t));
/* convert checksum errors into IO errors */
if (err == ECKSUM)
err = EIO;
@@ -218,27 +294,27 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
- arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+ arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_buf_alloc(spa,
- sizeof (objset_phys_t), &osi->os_phys_buf,
+ sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
- bcopy(osi->os_phys_buf->b_data, buf->b_data,
- arc_buf_size(osi->os_phys_buf));
- (void) arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf);
- osi->os_phys_buf = buf;
+ bcopy(os->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(os->os_phys_buf));
+ (void) arc_buf_remove_ref(os->os_phys_buf,
+ &os->os_phys_buf);
+ os->os_phys_buf = buf;
}
- osi->os_phys = osi->os_phys_buf->b_data;
- osi->os_flags = osi->os_phys->os_flags;
+ os->os_phys = os->os_phys_buf->b_data;
+ os->os_flags = os->os_phys->os_flags;
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- osi->os_phys_buf = arc_buf_alloc(spa, size,
- &osi->os_phys_buf, ARC_BUFC_METADATA);
- osi->os_phys = osi->os_phys_buf->b_data;
- bzero(osi->os_phys, size);
+ os->os_phys_buf = arc_buf_alloc(spa, size,
+ &os->os_phys_buf, ARC_BUFC_METADATA);
+ os->os_phys = os->os_phys_buf->b_data;
+ bzero(os->os_phys, size);
}
/*
@@ -249,61 +325,78 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
*/
if (ds) {
err = dsl_prop_register(ds, "primarycache",
- primary_cache_changed_cb, osi);
+ primary_cache_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "secondarycache",
- secondary_cache_changed_cb, osi);
+ secondary_cache_changed_cb, os);
if (!dsl_dataset_is_snapshot(ds)) {
if (err == 0)
err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
+ checksum_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
+ compression_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
+ copies_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "dedup",
+ dedup_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "logbias",
+ logbias_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "sync",
+ sync_changed_cb, os);
}
if (err) {
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf) == 1);
- kmem_free(osi, sizeof (objset_impl_t));
+ VERIFY(arc_buf_remove_ref(os->os_phys_buf,
+ &os->os_phys_buf) == 1);
+ kmem_free(os, sizeof (objset_t));
return (err);
}
} else if (ds == NULL) {
/* It's the meta-objset. */
- osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_LZJB;
- osi->os_copies = spa_max_replication(spa);
- osi->os_primary_cache = ZFS_CACHE_ALL;
- osi->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ os->os_compress = ZIO_COMPRESS_LZJB;
+ os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = 0;
+ os->os_logbias = 0;
+ os->os_sync = 0;
+ os->os_primary_cache = ZFS_CACHE_ALL;
+ os->os_secondary_cache = ZFS_CACHE_ALL;
}
- osi->os_zil_header = osi->os_phys->os_zil_header;
- osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
+ if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+ os->os_zil_header = os->os_phys->os_zil_header;
+ os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
- list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[i]));
- list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[i]));
}
- list_create(&osi->os_dnodes, sizeof (dnode_t),
+ list_create(&os->os_dnodes, sizeof (dnode_t),
offsetof(dnode_t, dn_link));
- list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
- mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
-
- osi->os_meta_dnode = dnode_special_open(osi,
- &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
- if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
- osi->os_userused_dnode = dnode_special_open(osi,
- &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
- osi->os_groupused_dnode = dnode_special_open(osi,
- &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ DMU_META_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+ &os->os_meta_dnode);
+ if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
+ DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+ &os->os_userused_dnode);
+ DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+ &os->os_groupused_dnode);
}
/*
@@ -311,117 +404,96 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* have ds_opening_lock
*/
if (ds) {
- VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
- dmu_objset_evict));
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
}
- *osip = osi;
+ *osp = os;
return (0);
}
-static int
-dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
- objset_impl_t *osi;
+ int err = 0;
mutex_enter(&ds->ds_opening_lock);
- osi = dsl_dataset_get_user_ptr(ds);
- if (osi == NULL) {
- int err;
-
+ *osp = ds->ds_objset;
+ if (*osp == NULL) {
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, &osi);
- if (err) {
- mutex_exit(&ds->ds_opening_lock);
- return (err);
- }
+ ds, dsl_dataset_get_blkptr(ds), osp);
}
mutex_exit(&ds->ds_opening_lock);
-
- os->os = osi;
- os->os_mode = DS_MODE_NOHOLD;
-
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
- return (EINVAL);
- return (0);
+ return (err);
}
+/* called from zpl */
int
-dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
- objset_t *os;
+ dsl_dataset_t *ds;
int err;
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dmu_objset_open_ds_os(ds, os, type);
+ err = dsl_dataset_hold(name, tag, &ds);
if (err)
- kmem_free(os, sizeof (objset_t));
- else
- *osp = os;
+ return (err);
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err)
+ dsl_dataset_rele(ds, tag);
+
return (err);
}
/* called from zpl */
int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
{
- objset_t *os;
dsl_dataset_t *ds;
int err;
- ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER ||
- DS_MODE_TYPE(mode) == DS_MODE_OWNER);
-
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- if (DS_MODE_TYPE(mode) == DS_MODE_USER)
- err = dsl_dataset_hold(name, os, &ds);
- else
- err = dsl_dataset_own(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
+ err = dsl_dataset_own(name, B_FALSE, tag, &ds);
+ if (err)
return (err);
- }
- err = dmu_objset_open_ds_os(ds, os, type);
+ err = dmu_objset_from_ds(ds, osp);
if (err) {
- if (DS_MODE_TYPE(mode) == DS_MODE_USER)
- dsl_dataset_rele(ds, os);
- else
- dsl_dataset_disown(ds, os);
- kmem_free(os, sizeof (objset_t));
- } else {
- os->os_mode = mode;
- *osp = os;
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dmu_objset_disown(*osp, tag);
+ return (EINVAL);
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dmu_objset_disown(*osp, tag);
+ return (EROFS);
}
return (err);
}
void
-dmu_objset_close(objset_t *os)
+dmu_objset_rele(objset_t *os, void *tag)
{
- ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER ||
- DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER ||
- DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD);
+ dsl_dataset_rele(os->os_dsl_dataset, tag);
+}
- if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER)
- dsl_dataset_rele(os->os->os_dsl_dataset, os);
- else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER)
- dsl_dataset_disown(os->os->os_dsl_dataset, os);
- kmem_free(os, sizeof (objset_t));
+void
+dmu_objset_disown(objset_t *os, void *tag)
+{
+ dsl_dataset_disown(os->os_dsl_dataset, tag);
}
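
[Editor's note: illustration only. The reworked lifecycle API pairs every
reference with an opaque tag (dmu_objset_hold/rele for non-exclusive use,
dmu_objset_own/disown for exclusive use), so each release must present the
same tag as the matching acquire. Below is a toy model of tag-attributed
refcounting, not the kernel implementation.]

#include <assert.h>
#include <stddef.h>

struct holds { const void *tag[8]; int n; };

static void
hold(struct holds *h, const void *tag)
{
	assert(h->n < 8);
	h->tag[h->n++] = tag;
}

static void
rele(struct holds *h, const void *tag)
{
	for (int i = 0; i < h->n; i++) {
		if (h->tag[i] == tag) {
			h->tag[i] = h->tag[--h->n];
			return;
		}
	}
	assert(!"release without a matching hold");
}

int
main(void)
{
	struct holds h = { { NULL }, 0 };
	static const char me[] = "FTAG";	/* caller identity as the tag */

	hold(&h, me);
	rele(&h, me);	/* every hold must be released under its own tag */
	assert(h.n == 0);
	return (0);
}
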
int
dmu_objset_evict_dbufs(objset_t *os)
{
- objset_impl_t *osi = os->os;
dnode_t *dn;
- mutex_enter(&osi->os_lock);
+ mutex_enter(&os->os_lock);
/* process the mdn last, since the other dnodes have holds on it */
- list_remove(&osi->os_dnodes, osi->os_meta_dnode);
- list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+ list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+ list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
/*
* Find the first dnode with holds. We have to do this dance
@@ -429,93 +501,114 @@ dmu_objset_evict_dbufs(objset_t *os)
* hold. If there are no holds then it has no dbufs so OK to
* skip.
*/
- for (dn = list_head(&osi->os_dnodes);
+ for (dn = list_head(&os->os_dnodes);
dn && !dnode_add_ref(dn, FTAG);
- dn = list_next(&osi->os_dnodes, dn))
+ dn = list_next(&os->os_dnodes, dn))
continue;
while (dn) {
dnode_t *next_dn = dn;
do {
- next_dn = list_next(&osi->os_dnodes, next_dn);
+ next_dn = list_next(&os->os_dnodes, next_dn);
} while (next_dn && !dnode_add_ref(next_dn, FTAG));
- mutex_exit(&osi->os_lock);
+ mutex_exit(&os->os_lock);
dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
- mutex_enter(&osi->os_lock);
+ mutex_enter(&os->os_lock);
dn = next_dn;
}
- mutex_exit(&osi->os_lock);
- return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
+ dn = list_head(&os->os_dnodes);
+ mutex_exit(&os->os_lock);
+ return (dn != DMU_META_DNODE(os));
}
void
-dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+dmu_objset_evict(objset_t *os)
{
- objset_impl_t *osi = arg;
- objset_t os;
- int i;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- }
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
if (!dsl_dataset_is_snapshot(ds)) {
VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
+ checksum_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
+ compression_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
+ copies_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+ dedup_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "logbias",
+ logbias_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "sync",
+ sync_changed_cb, os));
}
VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
- primary_cache_changed_cb, osi));
+ primary_cache_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
- secondary_cache_changed_cb, osi));
+ secondary_cache_changed_cb, os));
}
+ if (os->os_sa)
+ sa_tear_down(os);
+
/*
* We should need only a single pass over the dnode list, since
* nothing can be added to the list at this point.
*/
- os.os = osi;
- (void) dmu_objset_evict_dbufs(&os);
+ (void) dmu_objset_evict_dbufs(os);
- dnode_special_close(osi->os_meta_dnode);
- if (osi->os_userused_dnode) {
- dnode_special_close(osi->os_userused_dnode);
- dnode_special_close(osi->os_groupused_dnode);
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
}
- zil_free(osi->os_zil);
+ zil_free(os->os_zil);
+
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
- ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ kmem_free(os, sizeof (objset_t));
+}
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
- mutex_destroy(&osi->os_lock);
- mutex_destroy(&osi->os_obj_lock);
- mutex_destroy(&osi->os_user_ptr_lock);
- kmem_free(osi, sizeof (objset_impl_t));
+timestruc_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
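
[Editor's note: illustration of the empty-critical-section idiom in
dmu_objset_evict() above. Acquiring os_lock as reader and dropping it at once
cannot return while the mover side (dnode_move(), which takes os_lock as
writer) is mid-check, so the mover never sees a freed objset. Below is a
minimal pthread model under that assumption, with hypothetical names.]

#include <pthread.h>

static pthread_rwlock_t move_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
evict_barrier(void)
{
	/* Wait out any mover still inside its write-locked section. */
	(void) pthread_rwlock_rdlock(&move_lock);
	(void) pthread_rwlock_unlock(&move_lock);
	/* From here on the object may be freed safely. */
}

int
main(void)
{
	evict_barrier();
	return (0);
}
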
/* called from dsl for meta-objset */
-objset_impl_t *
+objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dmu_objset_type_t type, dmu_tx_t *tx)
{
- objset_impl_t *osi;
+ objset_t *os;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- if (ds)
- mutex_enter(&ds->ds_opening_lock);
- VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
- if (ds)
- mutex_exit(&ds->ds_opening_lock);
- mdn = osi->os_meta_dnode;
+ if (ds != NULL)
+ VERIFY(0 == dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -550,24 +643,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(type != DMU_OST_NONE);
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
- osi->os_phys->os_type = type;
- if (dmu_objset_userused_enabled(osi)) {
- osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
- osi->os_flags = osi->os_phys->os_flags;
+ os->os_phys->os_type = type;
+ if (dmu_objset_userused_enabled(os)) {
+ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
}
dsl_dataset_dirty(ds, tx);
- return (osi);
+ return (os);
}
struct oscarg {
void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
void *userarg;
- dsl_dataset_t *clone_parent;
+ dsl_dataset_t *clone_origin;
const char *lastname;
dmu_objset_type_t type;
uint64_t flags;
+ cred_t *cr;
};
/*ARGSUSED*/
@@ -585,17 +679,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (err != ENOENT)
return (err ? err : EEXIST);
- if (oa->clone_parent != NULL) {
- /*
- * You can't clone across pools.
- */
- if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+ if (oa->clone_origin != NULL) {
+ /* You can't clone across pools. */
+ if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
return (EXDEV);
- /*
- * You can only clone snapshots, not the head datasets.
- */
- if (oa->clone_parent->ds_phys->ds_num_children == 0)
+ /* You can only clone snapshots, not the head datasets. */
+ if (!dsl_dataset_is_snapshot(oa->clone_origin))
return (EINVAL);
}
@@ -603,41 +693,40 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
+ spa_t *spa = dd->dd_pool->dp_spa;
struct oscarg *oa = arg2;
- dsl_dataset_t *ds;
- blkptr_t *bp;
- uint64_t dsobj;
+ uint64_t obj;
ASSERT(dmu_tx_is_syncing(tx));
- dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_parent, oa->flags, cr, tx);
+ obj = dsl_dataset_create_sync(dd, oa->lastname,
+ oa->clone_origin, oa->flags, oa->cr, tx);
+
+ if (oa->clone_origin == NULL) {
+ dsl_pool_t *dp = dd->dd_pool;
+ dsl_dataset_t *ds;
+ blkptr_t *bp;
+ objset_t *os;
- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
- bp = dsl_dataset_get_blkptr(ds);
- if (BP_IS_HOLE(bp)) {
- objset_impl_t *osi;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+ bp = dsl_dataset_get_blkptr(ds);
+ ASSERT(BP_IS_HOLE(bp));
- /* This is an empty dmu_objset; not a clone. */
- osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, bp, oa->type, tx);
+ os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
if (oa->userfunc)
- oa->userfunc(&osi->os, oa->userarg, cr, tx);
+ oa->userfunc(os, oa->userarg, oa->cr, tx);
+ dsl_dataset_rele(ds, FTAG);
}
- spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dsobj);
-
- dsl_dataset_rele(ds, FTAG);
+ spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
}
int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
dsl_dir_t *pdd;
@@ -654,24 +743,13 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
return (EEXIST);
}
- dprintf("name=%s\n", name);
-
oa.userfunc = func;
oa.userarg = arg;
oa.lastname = tail;
oa.type = type;
oa.flags = flags;
+ oa.cr = CRED();
- if (clone_parent != NULL) {
- /*
- * You can't clone to a different type.
- */
- if (clone_parent->os->os_phys->os_type != type) {
- dsl_dir_close(pdd, FTAG);
- return (EINVAL);
- }
- oa.clone_parent = clone_parent->os->os_dsl_dataset;
- }
err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
dmu_objset_create_sync, pdd, &oa, 5);
dsl_dir_close(pdd, FTAG);
@@ -679,67 +757,59 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
}
int
-dmu_objset_destroy(const char *name)
+dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
{
- objset_t *os;
- int error;
-
- /*
- * If it looks like we'll be able to destroy it, and there's
- * an unplayed replay log sitting around, destroy the log.
- * It would be nicer to do this in dsl_dataset_destroy_sync(),
- * but the replay log objset is modified in open context.
- */
- error = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
- if (error == 0) {
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- zil_destroy(dmu_objset_zil(os), B_FALSE);
+ dsl_dir_t *pdd;
+ const char *tail;
+ int err = 0;
+ struct oscarg oa = { 0 };
- error = dsl_dataset_destroy(ds, os);
- /*
- * dsl_dataset_destroy() closes the ds.
- */
- kmem_free(os, sizeof (objset_t));
+ ASSERT(strchr(name, '@') == NULL);
+ err = dsl_dir_open(name, FTAG, &pdd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ dsl_dir_close(pdd, FTAG);
+ return (EEXIST);
}
- return (error);
+ oa.lastname = tail;
+ oa.clone_origin = clone_origin;
+ oa.flags = flags;
+ oa.cr = CRED();
+
+ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+ dmu_objset_create_sync, pdd, &oa, 5);
+ dsl_dir_close(pdd, FTAG);
+ return (err);
}
-/*
- * This will close the objset.
- */
int
-dmu_objset_rollback(objset_t *os)
+dmu_objset_destroy(const char *name, boolean_t defer)
{
- int err;
dsl_dataset_t *ds;
+ int error;
- ds = os->os->os_dsl_dataset;
-
- if (!dsl_dataset_tryown(ds, TRUE, os)) {
- dmu_objset_close(os);
- return (EBUSY);
+ error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_dataset_destroy(ds, FTAG, defer);
+ /* dsl_dataset_destroy() closes the ds. */
}
- err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
-
- /*
- * NB: we close the objset manually because the rollback
- * actually implicitly called dmu_objset_evict(), thus freeing
- * the objset_impl_t.
- */
- dsl_dataset_disown(ds, os);
- kmem_free(os, sizeof (objset_t));
- return (err);
+ return (error);
}
struct snaparg {
dsl_sync_task_group_t *dstg;
char *snapname;
+ char *htag;
char failed[MAXPATHLEN];
- boolean_t checkperms;
+ boolean_t recursive;
+ boolean_t needsuspend;
+ boolean_t temporary;
nvlist_t *props;
+ struct dsl_ds_holdarg *ha; /* only needed in the temporary case */
+ dsl_dataset_t *newds;
};
static int
@@ -747,77 +817,137 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
struct snaparg *sn = arg2;
+ int error;
/* The props have already been checked by zfs_check_userprops(). */
- return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
- sn->snapname, tx));
+ error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
+ sn->snapname, tx);
+ if (error)
+ return (error);
+
+ if (sn->temporary) {
+ /*
+ * Ideally we would just call
+ * dsl_dataset_user_hold_check() and
+ * dsl_dataset_destroy_check() here. However, the
+ * dataset we want to hold and destroy is the snapshot
+ * that we just confirmed we can create, but it won't
+ * exist until after these checks are run. Do any
+ * checks we can here and if more checks are added to
+ * those routines in the future, similar checks may be
+ * necessary here.
+ */
+ if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ /*
+ * Not checking the number of tags: the temporary tag will be
+ * the only tag on the snapshot, so it is necessarily unique.
+ */
+ if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (E2BIG);
+
+ sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+ sn->ha->temphold = B_TRUE;
+ sn->ha->htag = sn->htag;
+ }
+ return (error);
}
static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
struct snaparg *sn = arg2;
- dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+ dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
+
+ if (sn->props) {
+ dsl_props_arg_t pa;
+ pa.pa_props = sn->props;
+ pa.pa_source = ZPROP_SRC_LOCAL;
+ dsl_props_set_sync(ds->ds_prev, &pa, tx);
+ }
+
+ if (sn->temporary) {
+ struct dsl_ds_destroyarg da;
+
+ dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
+ kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
+ sn->ha = NULL;
+ sn->newds = ds->ds_prev;
- if (sn->props)
- dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+ da.ds = ds->ds_prev;
+ da.defer = B_TRUE;
+ dsl_dataset_destroy_sync(&da, FTAG, tx);
+ }
}
static int
-dmu_objset_snapshot_one(char *name, void *arg)
+dmu_objset_snapshot_one(const char *name, void *arg)
{
struct snaparg *sn = arg;
objset_t *os;
int err;
+ char *cp;
+
+ /*
+ * If the objset starts with a '%', then ignore it unless it was
+ * explicitly named (ie, not recursive). These hidden datasets
+ * are always inconsistent, and by not opening them here, we can
+ * avoid a race with dsl_dir_destroy_check().
+ */
+ cp = strrchr(name, '/');
+ if (cp && cp[1] == '%' && sn->recursive)
+ return (0);
(void) strcpy(sn->failed, name);
/*
- * Check permissions only when requested. This only applies when
- * doing a recursive snapshot. The permission checks for the starting
- * dataset have already been performed in zfs_secpolicy_snapshot()
+ * Check permissions if we are doing a recursive snapshot. The
+ * permission checks for the starting dataset have already been
+ * performed in zfs_secpolicy_snapshot().
*/
- if (sn->checkperms == B_TRUE &&
- (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+ if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
return (err);
- err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
+ err = dmu_objset_hold(name, sn, &os);
if (err != 0)
return (err);
- /* If the objset is in an inconsistent state, return busy */
- if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
- dmu_objset_close(os);
- return (EBUSY);
- }
-
/*
- * NB: we need to wait for all in-flight changes to get to disk,
- * so that we snapshot those changes. zil_suspend does this as
- * a side effect.
+ * If the objset is in an inconsistent state (eg, in the process
+ * of being destroyed), don't snapshot it. As with %hidden
+ * datasets, we return EBUSY if this name was explicitly
+ * requested (ie, not recursive), and otherwise ignore it.
*/
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0) {
- dsl_sync_task_create(sn->dstg, snapshot_check,
- snapshot_sync, os, sn, 3);
- } else {
- dmu_objset_close(os);
+ if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
+ dmu_objset_rele(os, sn);
+ return (sn->recursive ? 0 : EBUSY);
}
- return (err);
+ if (sn->needsuspend) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err) {
+ dmu_objset_rele(os, sn);
+ return (err);
+ }
+ }
+ dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
+ os, sn, 3);
+
+ return (0);
}
int
-dmu_objset_snapshot(char *fsname, char *snapname,
- nvlist_t *props, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
{
dsl_sync_task_t *dst;
struct snaparg sn;
spa_t *spa;
+ minor_t minor;
int err;
(void) strcpy(sn.failed, fsname);
@@ -826,16 +956,31 @@ dmu_objset_snapshot(char *fsname, char *snapname,
if (err)
return (err);
+ if (temporary) {
+ if (cleanup_fd < 0) {
+ spa_close(spa, FTAG);
+ return (EINVAL);
+ }
+ if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
+ spa_close(spa, FTAG);
+ return (err);
+ }
+ }
+
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
+ sn.htag = tag;
sn.props = props;
+ sn.recursive = recursive;
+ sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ sn.temporary = temporary;
+ sn.ha = NULL;
+ sn.newds = NULL;
if (recursive) {
- sn.checkperms = B_TRUE;
err = dmu_objset_find(fsname,
dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
} else {
- sn.checkperms = B_FALSE;
err = dmu_objset_snapshot_one(fsname, &sn);
}
@@ -845,15 +990,33 @@ dmu_objset_snapshot(char *fsname, char *snapname,
for (dst = list_head(&sn.dstg->dstg_tasks); dst;
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
objset_t *os = dst->dst_arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- if (dst->dst_err)
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ if (dst->dst_err) {
dsl_dataset_name(ds, sn.failed);
- zil_resume(dmu_objset_zil(os));
- dmu_objset_close(os);
+ } else if (temporary) {
+ dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
+ }
+ if (sn.needsuspend)
+ zil_resume(dmu_objset_zil(os));
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) {
+ char name[MAXNAMELEN];
+
+ dmu_objset_name(os, name);
+ strlcat(name, "@", sizeof(name));
+ strlcat(name, snapname, sizeof(name));
+ zvol_create_minors(name);
+ }
+#endif
+#endif
+ dmu_objset_rele(os, &sn);
}
if (err)
(void) strcpy(fsname, sn.failed);
+ if (temporary)
+ zfs_onexit_fd_rele(cleanup_fd);
dsl_sync_task_group_destroy(sn.dstg);
spa_close(spa, FTAG);
return (err);
@@ -888,11 +1051,10 @@ dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
/* ARGSUSED */
static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- objset_impl_t *os = arg;
+ objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(bp == os->os_rootbp);
@@ -908,24 +1070,34 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, zio, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
/* called from dsl */
void
-dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_t zb;
- writeprops_t wp = { 0 };
+ zio_prop_t zp;
zio_t *zio;
list_t *list;
list_t *newlist = NULL;
@@ -949,42 +1121,33 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
/*
* Create the root block IO
*/
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1; /* for block ordering; it's level 0 on disk */
- zb.zb_blkid = 0;
-
- wp.wp_type = DMU_OT_OBJSET;
- wp.wp_level = 0; /* on-disk BP level; see above */
- wp.wp_copies = os->os_copies;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_oscompress = os->os_compress;
-
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- os->os_rootbp, pio, tx);
- }
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
+ os->os_rootbp, os->os_spa, &zb));
- arc_release(os->os_phys_buf, &os->os_phys_buf);
+ dmu_write_policy(os, NULL, 0, 0, &zp);
- zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
+ dmu_objset_write_ready, dmu_objset_write_done, os,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync special dnodes - the parent IO for the sync is the root block
*/
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
os->os_phys->os_flags = os->os_flags;
- if (os->os_userused_dnode &&
- os->os_userused_dnode->dn_type != DMU_OT_NONE) {
- os->os_userused_dnode->dn_zio = zio;
- dnode_sync(os->os_userused_dnode, tx);
- os->os_groupused_dnode->dn_zio = zio;
- dnode_sync(os->os_groupused_dnode, tx);
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
}
txgoff = tx->tx_txg & TXG_MASK;
@@ -1002,7 +1165,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while (dr = list_head(list)) {
ASSERT(dr->dr_dbuf->db_level == 0);
list_remove(list, dr);
@@ -1017,6 +1180,22 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
zio_nowait(zio);
}
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
+ !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
+}
+
+boolean_t
+dmu_objset_is_dirty_anywhere(objset_t *os)
+{
+ for (int t = 0; t < TXG_SIZE; t++)
+ if (dmu_objset_is_dirty(os, t))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
@@ -1026,74 +1205,86 @@ dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
}
boolean_t
-dmu_objset_userused_enabled(objset_impl_t *os)
+dmu_objset_userused_enabled(objset_t *os)
{
return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
- used_cbs[os->os_phys->os_type] &&
- os->os_userused_dnode);
+ used_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
+}
+
+static void
+do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
+ uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+{
+ if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+ int64_t delta = DNODE_SIZE + used;
+ if (subtract)
+ delta = -delta;
+ VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
+ user, delta, tx));
+ VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+ group, delta, tx));
+ }
}
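
[Editor's note: the new do_userquota_update() charges DNODE_SIZE plus the object's used bytes to both the owning user's and group's entries, negating the delta when retiring the old accounting. The following is a minimal userspace sketch of that two-sided delta logic, not the kernel code: the arrays stand in for the USERUSED/GROUPUSED ZAPs, and DNODE_SIZE here is an assumed constant.]

    /*
     * Userspace model of the userquota delta logic above. The "zap"
     * is just an array; names and sizes are hypothetical.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define DNODE_SIZE 512   /* assumed per-object dnode overhead */
    #define NIDS 4

    static int64_t userused[NIDS], groupused[NIDS];

    static void
    model_userquota_update(uint64_t used, uint64_t user, uint64_t group,
        int subtract)
    {
            int64_t delta = DNODE_SIZE + used;

            if (subtract)
                    delta = -delta;
            userused[user] += delta;   /* stands in for zap_increment_int() */
            groupused[group] += delta;
    }

    int
    main(void)
    {
            /* Object grows from 4K to 16K: retire old charge, add new. */
            model_userquota_update(4096, 1, 2, 1);
            model_userquota_update(16384, 1, 2, 0);
            printf("user 1: %lld\n", (long long)userused[1]);  /* 12288 */
            return (0);
    }

The net effect is that each synced dnode contributes exactly its current charge, regardless of how many times it changed within the txg.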
void
-dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
dnode_t *dn;
list_t *list = &os->os_synced_dnodes;
- static const char zerobuf[DN_MAX_BONUSLEN] = {0};
ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
while (dn = list_head(list)) {
- dmu_object_type_t bonustype;
-
+ int flags;
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
- ASSERT(dn->dn_oldphys);
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED);
/* Allocate the user/groupused objects if necessary. */
- if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
- VERIFY(0 == zap_create_claim(&os->os,
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY(0 == zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
- VERIFY(0 == zap_create_claim(&os->os,
+ VERIFY(0 == zap_create_claim(os,
DMU_GROUPUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
/*
- * If the object was not previously
- * accounted, pretend that it was free.
+		 * We intentionally modify the zap object even if the
+		 * net delta is zero. Otherwise the block of the zap obj
+		 * could be shared between datasets, but would need to be
+		 * different between them after a bprewrite.
*/
- if (!(dn->dn_oldphys->dn_flags &
- DNODE_FLAG_USERUSED_ACCOUNTED)) {
- bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
- }
- /*
- * If the object was freed, use the previous bonustype.
- */
- bonustype = dn->dn_phys->dn_bonustype ?
- dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
- ASSERT(dn->dn_phys->dn_type != 0 ||
- (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
- DN_MAX_BONUSLEN) == 0 &&
- DN_USED_BYTES(dn->dn_phys) == 0));
- ASSERT(dn->dn_oldphys->dn_type != 0 ||
- (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
- DN_MAX_BONUSLEN) == 0 &&
- DN_USED_BYTES(dn->dn_oldphys) == 0));
- used_cbs[os->os_phys->os_type](&os->os, bonustype,
- DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
- DN_USED_BYTES(dn->dn_oldphys),
- DN_USED_BYTES(dn->dn_phys), tx);
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+ }
+ if (flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+ dn->dn_phys->dn_flags, dn->dn_newuid,
+ dn->dn_newgid, B_FALSE, tx);
+ }
- /*
- * The mutex is needed here for interlock with dnode_allocate.
- */
mutex_enter(&dn->dn_mtx);
- zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
- dn->dn_oldphys = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
mutex_exit(&dn->dn_mtx);
list_remove(list, dn);
@@ -1101,10 +1292,151 @@ dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
}
}
+/*
+ * Returns a pointer to the data from which to find the uid/gid.
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found, NULL is returned; in that case it is assumed that the
+ * uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr, **drp;
+ void *data;
+
+ if (db->db_dirtycnt == 0)
+ return (db->db.db_data); /* Nothing is changing */
+
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg == tx->tx_txg)
+ break;
+
+ if (dr == NULL) {
+ data = NULL;
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(dr->dr_dbuf);
+ dn = DB_DNODE(dr->dr_dbuf);
+
+ if (dn->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+
+ DB_DNODE_EXIT(dr->dr_dbuf);
+ }
+
+ return (data);
+}
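
[Editor's note: the helper above walks the dbuf's dirty-record list looking for the record that belongs to the syncing txg. A compilable sketch of that walk over a singly linked list follows; the types are invented for illustration and carry none of the kernel's locking.]

    /*
     * Find the dirty record whose dr_txg matches the syncing txg,
     * else NULL (nothing changed in that txg). Illustrative types.
     */
    #include <stddef.h>
    #include <stdint.h>

    typedef struct dirty_rec {
            uint64_t dr_txg;
            void *dr_data;
            struct dirty_rec *dr_next;
    } dirty_rec_t;

    static void *
    find_data_for_txg(dirty_rec_t *head, uint64_t txg)
    {
            dirty_rec_t *dr;

            for (dr = head; dr != NULL; dr = dr->dr_next)
                    if (dr->dr_txg == txg)
                            return (dr->dr_data);
            return (NULL);  /* caller treats NULL as "ids unchanged" */
    }

    int
    main(void)
    {
            dirty_rec_t a = { 7, "old", NULL };
            dirty_rec_t b = { 9, "new", &a };

            return (find_data_for_txg(&b, 9) != NULL ? 0 : 1);
    }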
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_impl_t *db = NULL;
+ uint64_t *user, *group;
+ int flags = dn->dn_id_flags;
+ int error;
+ boolean_t have_spill = B_FALSE;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0) {
+ if (dn->dn_bonus) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ data = dmu_objset_userquota_find_data(db, tx);
+ } else {
+ data = DN_BONUS(dn->dn_phys);
+ }
+ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
+ FTAG, (dmu_buf_t **)&db);
+ ASSERT(error == 0);
+ mutex_enter(&db->db_mtx);
+ data = (before) ? db->db.db_data :
+ dmu_objset_userquota_find_data(db, tx);
+ have_spill = B_TRUE;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+ if (before) {
+ ASSERT(data);
+ user = &dn->dn_olduid;
+ group = &dn->dn_oldgid;
+ } else if (data) {
+ user = &dn->dn_newuid;
+ group = &dn->dn_newgid;
+ }
+
+ /*
+ * Must always call the callback in case the object
+	 * type has changed and that type isn't an object type to track.
+ */
+ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+ user, group);
+
+ /*
+ * Preserve existing uid/gid when the callback can't determine
+ * what the new uid/gid are and the callback returned EEXIST.
+ * The EEXIST error tells us to just use the existing uid/gid.
+ * If we don't know what the old values are then just assign
+ * them to 0, since that is a new file being created.
+ */
+ if (!before && data == NULL && error == EEXIST) {
+ if (flags & DN_ID_OLD_EXIST) {
+ dn->dn_newuid = dn->dn_olduid;
+ dn->dn_newgid = dn->dn_oldgid;
+ } else {
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ }
+ error = 0;
+ }
+
+ if (db)
+ mutex_exit(&db->db_mtx);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (have_spill) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (have_spill)
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
- return (os->os->os_phys->os_flags &
+ return (os->os_phys->os_flags &
OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}
@@ -1116,7 +1448,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
if (dmu_objset_userspace_present(os))
return (0);
- if (!dmu_objset_userused_enabled(os->os))
+ if (!dmu_objset_userused_enabled(os))
return (ENOTSUP);
if (dmu_objset_is_snapshot(os))
return (EINVAL);
@@ -1152,7 +1484,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
dmu_tx_commit(tx);
}
- os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);
}
@@ -1161,35 +1493,35 @@ void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
- dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
+ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
usedobjsp, availobjsp);
}
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
- return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
+ return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
- stat->dds_type = os->os->os_phys->os_type;
- if (os->os->os_dsl_dataset)
- dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
+ stat->dds_type = os->os_phys->os_type;
+ if (os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
- ASSERT(os->os->os_dsl_dataset ||
- os->os->os_phys->os_type == DMU_OST_META);
+ ASSERT(os->os_dsl_dataset ||
+ os->os_phys->os_type == DMU_OST_META);
- if (os->os->os_dsl_dataset != NULL)
- dsl_dataset_stats(os->os->os_dsl_dataset, nv);
+ if (os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os_dsl_dataset, nv);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
- os->os->os_phys->os_type);
+ os->os_phys->os_type);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
dmu_objset_userspace_present(os));
}
@@ -1197,8 +1529,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
int
dmu_objset_is_snapshot(objset_t *os)
{
- if (os->os->os_dsl_dataset != NULL)
- return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ if (os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
else
return (B_FALSE);
}
@@ -1207,7 +1539,7 @@ int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
boolean_t *conflict)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
uint64_t ignored;
if (ds->ds_phys->ds_snapnames_zapobj == 0)
@@ -1222,7 +1554,7 @@ int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
zap_cursor_t cursor;
zap_attribute_t attr;
@@ -1259,12 +1591,12 @@ int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp)
{
- dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
zap_cursor_t cursor;
zap_attribute_t attr;
/* there is no next dir on a snapshot! */
- if (os->os->os_dsl_dataset->ds_object !=
+ if (os->os_dsl_dataset->ds_object !=
dd->dd_phys->dd_head_dataset_obj)
return (ENOENT);
@@ -1293,7 +1625,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
}
struct findarg {
- int (*func)(char *, void *);
+ int (*func)(const char *, void *);
void *arg;
};
@@ -1302,7 +1634,7 @@ static int
findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
struct findarg *fa = arg;
- return (fa->func((char *)dsname, fa->arg));
+ return (fa->func(dsname, fa->arg));
}
/*
@@ -1310,7 +1642,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
* Perhaps change all callers to use dmu_objset_find_spa()?
*/
int
-dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
+ int flags)
{
struct findarg fa;
fa.func = func;
@@ -1361,12 +1694,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
ASSERT(attr->za_integer_length == sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strcpy(child, name);
- (void) strcat(child, "/");
- (void) strcat(child, attr->za_name);
+ child = kmem_asprintf("%s/%s", name, attr->za_name);
err = dmu_objset_find_spa(spa, child, func, arg, flags);
- kmem_free(child, MAXPATHLEN);
+ strfree(child);
if (err)
break;
}
@@ -1400,13 +1730,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strcpy(child, name);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
+ child = kmem_asprintf("%s@%s",
+ name, attr->za_name);
err = func(spa, attr->za_first_integer,
child, arg);
- kmem_free(child, MAXPATHLEN);
+ strfree(child);
if (err)
break;
}
@@ -1429,7 +1757,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
/* ARGSUSED */
int
-dmu_objset_prefetch(char *name, void *arg)
+dmu_objset_prefetch(const char *name, void *arg)
{
dsl_dataset_t *ds;
@@ -1438,16 +1766,14 @@ dmu_objset_prefetch(char *name, void *arg)
if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
mutex_enter(&ds->ds_opening_lock);
- if (!dsl_dataset_get_user_ptr(ds)) {
+ if (ds->ds_objset == NULL) {
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = ds->ds_object;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
+ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
+ (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
&ds->ds_phys->ds_bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
@@ -1463,13 +1789,13 @@ dmu_objset_prefetch(char *name, void *arg)
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
- ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
- os->os->os_user_ptr = user_ptr;
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ os->os_user_ptr = user_ptr;
}
void *
dmu_objset_get_user(objset_t *os)
{
- ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
- return (os->os->os_user_ptr);
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ return (os->os_user_ptr);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index ed5afb4e1df5..55451fd931ad 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -33,14 +32,32 @@
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
static char *dmu_recv_tag = "dmu_recv_tag";
+/*
+ * The types of record whose emission into a send stream can be left
+ * pending from one call to backup_cb to another: multiple calls to
+ * dump_free() and dump_freeobjects() can be aggregated into a single
+ * DRR_FREE or DRR_FREEOBJECTS replay record.
+ */
+typedef enum {
+ PENDING_NONE,
+ PENDING_FREE,
+ PENDING_FREEOBJECTS
+} pendop_t;
+
struct backuparg {
dmu_replay_record_t *drr;
kthread_t *td;
@@ -48,7 +65,9 @@ struct backuparg {
offset_t *off;
objset_t *os;
zio_cksum_t zc;
+ uint64_t toguid;
int err;
+ pendop_t pending_op;
};
static int
@@ -56,11 +75,9 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
{
struct uio auio;
struct iovec aiov;
-
ASSERT3U(len % 8, ==, 0);
fletcher_4_incremental_native(buf, len, &ba->zc);
-
aiov.iov_base = buf;
aiov.iov_len = len;
auio.uio_iov = &aiov;
@@ -79,7 +96,6 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
ba->err = EOPNOTSUPP;
#endif
*ba->off += len;
-
return (ba->err);
}
@@ -87,29 +103,120 @@ static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
uint64_t length)
{
- /* write a FREE record */
+ struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREE, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_FREE records can only be aggregated with
+ * other DRR_FREE records. DRR_FREEOBJECTS records can only be
+ * aggregated with other DRR_FREEOBJECTS records.
+ */
+ if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
+ if (ba->pending_op == PENDING_FREE) {
+ /*
+ * There should never be a PENDING_FREE if length is -1
+ * (because dump_dnode is the only place where this
+ * function is called with a -1, and only after flushing
+ * any pending record).
+ */
+ ASSERT(length != -1ULL);
+ /*
+ * Check to see whether this free block can be aggregated
+ * with pending one.
+ */
+ if (drrf->drr_object == object && drrf->drr_offset +
+ drrf->drr_length == offset) {
+ drrf->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_bytes(ba, ba->drr,
+ sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ }
+ /* create a FREE record and make it pending */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_FREE;
- ba->drr->drr_u.drr_free.drr_object = object;
- ba->drr->drr_u.drr_free.drr_offset = offset;
- ba->drr->drr_u.drr_free.drr_length = length;
+ drrf->drr_object = object;
+ drrf->drr_offset = offset;
+ drrf->drr_length = length;
+ drrf->drr_toguid = ba->toguid;
+ if (length == -1ULL) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ } else {
+ ba->pending_op = PENDING_FREE;
+ }
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
return (0);
}
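
[Editor's note: dump_free() keeps at most one FREE record pending and extends it while successive frees are contiguous within the same object; anything else flushes the pending record first. Here is a userspace model of that range-merging rule under simplified assumptions: the struct below replaces the full dmu_replay_record_t, and flush() prints where the kernel would emit stream bytes.]

    /*
     * Model of DRR_FREE aggregation: contiguous frees of the same
     * object extend one pending record; a gap or a different object
     * flushes it. Struct and names are illustrative only.
     */
    #include <stdio.h>
    #include <stdint.h>

    struct pend_free {
            int valid;
            uint64_t object, offset, length;
    };

    static void
    flush(struct pend_free *p)
    {
            if (p->valid)
                    printf("FREE obj %llu off %llu len %llu\n",
                        (unsigned long long)p->object,
                        (unsigned long long)p->offset,
                        (unsigned long long)p->length);
            p->valid = 0;
    }

    static void
    model_dump_free(struct pend_free *p, uint64_t obj, uint64_t off,
        uint64_t len)
    {
            if (p->valid && p->object == obj &&
                p->offset + p->length == off) {
                    p->length += len;       /* contiguous: aggregate */
                    return;
            }
            flush(p);                       /* not a continuation */
            p->valid = 1;
            p->object = obj;
            p->offset = off;
            p->length = len;
    }

    int
    main(void)
    {
            struct pend_free p = { 0 };

            model_dump_free(&p, 5, 0, 4096);
            model_dump_free(&p, 5, 4096, 4096);     /* merged */
            model_dump_free(&p, 5, 16384, 4096);    /* gap: flush first */
            flush(&p);
            return (0);
    }

dump_freeobjects() below applies the same pattern to object ranges instead of byte ranges.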
static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, void *data)
+ uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
+ struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
+
+ /*
+ * If there is any kind of pending aggregation (currently either
+ * a grouping of free objects or free blocks), push it out to
+ * the stream, since aggregation can't be done across operations
+ * of different types.
+ */
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
/* write a DATA record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_WRITE;
- ba->drr->drr_u.drr_write.drr_object = object;
- ba->drr->drr_u.drr_write.drr_type = type;
- ba->drr->drr_u.drr_write.drr_offset = offset;
- ba->drr->drr_u.drr_write.drr_length = blksz;
+ drrw->drr_object = object;
+ drrw->drr_type = type;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = ba->toguid;
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ if (dump_bytes(ba, data, blksz) != 0)
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
+{
+ struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
+
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = ba->toguid;
if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
return (EINTR);
@@ -121,39 +228,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type,
static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
+ struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+ * push it out, since free block aggregation can only be done for
+	 * blocks of the same type (i.e., DRR_FREE records can only be
+	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
+	 * can only be aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (ba->pending_op != PENDING_NONE &&
+ ba->pending_op != PENDING_FREEOBJECTS) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ if (ba->pending_op == PENDING_FREEOBJECTS) {
+ /*
+ * See whether this free object array can be aggregated
+		 * with the pending one.
+ */
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+ drrfo->drr_numobjs += numobjs;
+ return (0);
+ } else {
+ /* can't be aggregated. Push out pending record */
+ if (dump_bytes(ba, ba->drr,
+ sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ }
+
/* write a FREEOBJECTS record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_FREEOBJECTS;
- ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
- ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+ drrfo->drr_firstobj = firstobj;
+ drrfo->drr_numobjs = numobjs;
+ drrfo->drr_toguid = ba->toguid;
+
+ ba->pending_op = PENDING_FREEOBJECTS;
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
return (0);
}
static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
+ struct drr_object *drro = &(ba->drr->drr_u.drr_object);
+
if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
return (dump_freeobjects(ba, object, 1));
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
/* write an OBJECT record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_OBJECT;
- ba->drr->drr_u.drr_object.drr_object = object;
- ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
- ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
- ba->drr->drr_u.drr_object.drr_blksz =
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
- ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
- ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ drro->drr_object = object;
+ drro->drr_type = dnp->dn_type;
+ drro->drr_bonustype = dnp->dn_bonustype;
+ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_checksumtype = dnp->dn_checksum;
+ drro->drr_compress = dnp->dn_compress;
+ drro->drr_toguid = ba->toguid;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
return (EINTR);
- if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+ if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
return (EINTR);
/* free anything past the end of the file */
@@ -169,9 +317,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/* ARGSUSED */
static int
-backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct backuparg *ba = arg;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
@@ -180,9 +329,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
- if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (bp == NULL && zb->zb_object == 0) {
+ } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
@@ -198,7 +348,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
- if (arc_read_nolock(NULL, spa, bp,
+ if (dsl_read(NULL, spa, bp, pbuf,
arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
@@ -212,7 +362,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
- } else { /* it's a level-0 block of a regular object */
+ } else if (type == DMU_OT_SA) {
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -222,8 +372,20 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
+ err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ } else { /* it's a level-0 block of a regular object */
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ int blksz = BP_GET_LSIZE(bp);
+
+ if (dsl_read(NULL, spa, bp, pbuf,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, abuf->b_data);
+ blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -235,8 +397,8 @@ int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
struct file *fp, offset_t *off)
{
- dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
- dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+ dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
dmu_replay_record_t *drr;
struct backuparg ba;
int err;
@@ -273,10 +435,25 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
+ DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
+ DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+ if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
+ uint64_t version;
+ if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0)
+ return (EINVAL);
+ if (version == ZPL_VERSION_SA) {
+ DMU_SET_FEATUREFLAGS(
+ drr->drr_u.drr_begin.drr_versioninfo,
+ DMU_BACKUP_FEATURE_SA_SPILL);
+ }
+ }
+#endif
+
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
- drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
if (fromorigin)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
@@ -297,9 +474,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
ba.fp = fp;
ba.os = tosnap;
ba.off = off;
+ ba.toguid = ds->ds_phys->ds_guid;
ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
+ ba.pending_op = PENDING_NONE;
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
return (ba.err);
}
@@ -307,6 +486,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
backup_cb, &ba);
+ if (ba.pending_op != PENDING_NONE)
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
+ err = EINTR;
+
if (err) {
if (err == EINTR && ba.err)
err = ba.err;
@@ -317,8 +500,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
bzero(drr, sizeof (dmu_replay_record_t));
drr->drr_type = DRR_END;
drr->drr_u.drr_end.drr_checksum = ba.zc;
+ drr->drr_u.drr_end.drr_toguid = ba.toguid;
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
return (ba.err);
}
@@ -339,33 +523,12 @@ struct recvbeginsyncarg {
uint64_t dsflags;
char clonelastname[MAXNAMELEN];
dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
+ cred_t *cr;
};
-static dsl_dataset_t *
-recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
- cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
-
- /* This should always work, since we just created it */
- /* XXX - create should return an owned ds */
- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
- DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
-
- if (type != DMU_OST_NONE) {
- (void) dmu_objset_create_impl(dp->dp_spa,
- ds, &ds->ds_phys->ds_bp, type, tx);
- }
-
- spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
-
- return (ds);
-}
-
/* ARGSUSED */
static int
-recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -383,7 +546,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* make sure it's a snap in the same pool */
if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
+ if (!dsl_dataset_is_snapshot(rbsa->origin))
return (EINVAL);
if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
return (ENODEV);
@@ -393,77 +556,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct recvbeginsyncarg *rbsa = arg2;
uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
+ /* Create and open new dataset. */
dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
- rbsa->origin, flags, cr, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
+ rbsa->origin, flags, rbsa->cr, tx);
+ VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
+ B_TRUE, dmu_recv_tag, &rbsa->ds));
-static int
-recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- int err;
-
- /* must be a head ds */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /* must not be a clone ds */
- if (dsl_dir_is_clone(ds->ds_dir))
- return (EINVAL);
-
- err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
- if (err)
- return (err);
-
- if (rbsa->origin) {
- /* make sure it's a snap in the same pool */
- if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
- return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
- return (EINVAL);
- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
+ if (rbsa->origin == NULL) {
+ (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
+ rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
}
- return (0);
-}
-
-static void
-recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
- uint64_t dsobj;
-
- /*
- * NB: caller must provide an extra hold on the dsl_dir_t, so it
- * won't go away when dsl_dataset_destroy_sync() closes the
- * dataset.
- */
- dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
-
- dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+ spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
+ dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}
/* ARGSUSED */
static int
-recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -474,77 +591,105 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
return (ETXTBSY);
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
-
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
-
- /* temporary clone name must not exist */
+ /* new snapshot name must not exist */
err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_dir->dd_phys->dd_child_dir_zapobj,
- rbsa->clonelastname, 8, 1, &val);
+ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
if (err == 0)
return (EEXIST);
if (err != ENOENT)
return (err);
- /* new snapshot name must not exist */
+ if (rbsa->fromguid) {
+ /* if incremental, most recent snapshot must match fromguid */
+ if (ds->ds_prev == NULL)
+ return (ENODEV);
+
+ /*
+ * most recent snapshot must match fromguid, or there are no
+ * changes since the fromguid one
+ */
+ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
+ uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
+ uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
+ while (obj != 0) {
+ dsl_dataset_t *snap;
+ err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ obj, FTAG, &snap);
+ if (err)
+ return (ENODEV);
+ if (snap->ds_phys->ds_creation_txg < birth) {
+ dsl_dataset_rele(snap, FTAG);
+ return (ENODEV);
+ }
+ if (snap->ds_phys->ds_guid == rbsa->fromguid) {
+ dsl_dataset_rele(snap, FTAG);
+ break; /* it's ok */
+ }
+ obj = snap->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
+ return (ENODEV);
+ }
+ } else {
+ /* if full, most recent snapshot must be $ORIGIN */
+ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
+ return (ENODEV);
+ }
+
+ /* temporary clone name must not exist */
err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+ ds->ds_dir->dd_phys->dd_child_dir_zapobj,
+ rbsa->clonelastname, 8, 1, &val);
if (err == 0)
return (EEXIST);
if (err != ENOENT)
return (err);
+
return (0);
}
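
[Editor's note: when the most recent snapshot's guid doesn't match fromguid, the check above walks older snapshots via ds_prev_snap_obj, accepting the stream only if a matching snapshot exists and nothing newer than it was born. A sketch of that chain walk, with invented types and no pool locking; birth_txg stands in for the newest snapshot's blkptr birth.]

    /*
     * Sketch of the snapshot-chain walk in recv_existing_check():
     * follow prev pointers until the guid matches (accept) or a
     * snapshot older than birth_txg is reached (reject).
     */
    #include <stdint.h>

    typedef struct snap {
            uint64_t guid;
            uint64_t creation_txg;
            struct snap *prev;
    } snap_t;

    /* Returns 0 if fromguid names an acceptable ancestor, 1 otherwise. */
    static int
    check_fromguid(snap_t *newest, uint64_t fromguid, uint64_t birth_txg)
    {
            snap_t *s;

            if (newest->guid == fromguid)
                    return (0);
            for (s = newest->prev; s != NULL; s = s->prev) {
                    if (s->creation_txg < birth_txg)
                            return (1);     /* changes since fromguid */
                    if (s->guid == fromguid)
                            return (0);
            }
            return (1);
    }

    int
    main(void)
    {
            snap_t s1 = { 0x111, 100, NULL };       /* older snapshot */
            snap_t s2 = { 0x222, 200, &s1 };        /* newest snapshot */

            return (check_fromguid(&s2, 0x111, 50));  /* 0: accepted */
    }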
/* ARGSUSED */
static void
-recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ohds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
dsl_pool_t *dp = ohds->ds_dir->dd_pool;
- dsl_dataset_t *ods, *cds;
+ dsl_dataset_t *cds;
uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
- /* create the temporary clone */
- VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
- FTAG, &ods));
- dsobj = dsl_dataset_create_sync(ohds->ds_dir,
- rbsa->clonelastname, ods, flags, cr, tx);
- dsl_dataset_rele(ods, FTAG);
-
- /* open the temporary clone */
- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
- DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
+ /* create and open the temporary clone */
+ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
+ ohds->ds_prev, flags, rbsa->cr, tx);
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
- /* copy the refquota from the target fs to the clone */
- if (ohds->ds_quota > 0)
- dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
+ /*
+ * If we actually created a non-clone, we need to create the
+ * objset in our new dataset.
+ */
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+ }
rbsa->ds = cds;
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+ spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
+ dp->dp_spa, tx, "dataset = %lld", dsobj);
}
-/* ARGSUSED */
-static void
-recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+static boolean_t
+dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
- dsl_dataset_t *ds = arg1;
+ int featureflags;
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
- ds->ds_object);
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}
/*
@@ -552,13 +697,13 @@ recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
int err = 0;
boolean_t byteswap;
- struct recvbeginsyncarg rbsa;
- uint64_t version;
+ struct recvbeginsyncarg rbsa = { 0 };
+ uint64_t versioninfo;
int flags;
dsl_dataset_t *ds;
@@ -571,22 +716,23 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
rbsa.tofs = tofs;
rbsa.tosnap = tosnap;
- rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
+ rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
rbsa.fromguid = drrb->drr_fromguid;
rbsa.type = drrb->drr_type;
rbsa.tag = FTAG;
rbsa.dsflags = 0;
- version = drrb->drr_version;
+ rbsa.cr = CRED();
+ versioninfo = drrb->drr_versioninfo;
flags = drrb->drr_flags;
if (byteswap) {
rbsa.type = BSWAP_32(rbsa.type);
rbsa.fromguid = BSWAP_64(rbsa.fromguid);
- version = BSWAP_64(version);
+ versioninfo = BSWAP_64(versioninfo);
flags = BSWAP_32(flags);
}
- if (version != DMU_BACKUP_STREAM_VERSION ||
+ if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
rbsa.type >= DMU_OST_NUMTYPES ||
((flags & DRR_FLAG_CLONE) && origin == NULL))
return (EINVAL);
@@ -597,102 +743,81 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
bzero(drc, sizeof (dmu_recv_cookie_t));
drc->drc_drrb = drrb;
drc->drc_tosnap = tosnap;
+ drc->drc_top_ds = top_ds;
drc->drc_force = force;
/*
* Process the begin in syncing context.
*/
- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
- /* offline incremental receive */
- err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
- if (err)
- return (err);
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- rbsa.fromguid) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ /* open the dataset we are logically receiving into */
+ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+ if (err == 0) {
+ if (dmu_recv_verify_features(ds, drrb)) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (ENOTSUP);
}
- rbsa.force = B_FALSE;
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_offline_incremental_sync, ds, &rbsa, 1);
- if (err) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (err);
+ /* target fs already exists; recv into temp clone */
+
+ /* Can't recv a clone into an existing fs */
+ if (flags & DRR_FLAG_CLONE) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (EINVAL);
+ }
+
+ /* must not have an incremental recv already in progress */
+ if (!mutex_tryenter(&ds->ds_recvlock)) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (EBUSY);
}
- drc->drc_logical_ds = drc->drc_real_ds = ds;
- } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
- /* online incremental receive */
/* tmp clone name is: tofs/%tosnap" */
(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
"%%%s", tosnap);
-
- /* open the dataset we are logically receiving into */
- err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
- if (err)
- return (err);
-
rbsa.force = force;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_online_incremental_sync, ds, &rbsa, 5);
+ recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
if (err) {
+ mutex_exit(&ds->ds_recvlock);
dsl_dataset_rele(ds, dmu_recv_tag);
return (err);
}
drc->drc_logical_ds = ds;
drc->drc_real_ds = rbsa.ds;
- } else {
- /* create new fs -- full backup or clone */
- dsl_dir_t *dd = NULL;
- const char *tail;
+ } else if (err == ENOENT) {
+ /* target fs does not exist; must be a full backup or clone */
+ char *cp;
- err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+ /*
+ * If it's a non-clone incremental, we are missing the
+ * target fs, so fail the recv.
+ */
+ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
+ return (ENOENT);
+
+ /* Open the parent of tofs */
+ cp = strrchr(tofs, '/');
+ *cp = '\0';
+ err = dsl_dataset_hold(tofs, FTAG, &ds);
+ *cp = '/';
if (err)
return (err);
- if (tail == NULL) {
- if (!force) {
- dsl_dir_close(dd, FTAG);
- return (EEXIST);
- }
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- err = dsl_dataset_own_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj,
- DS_MODE_INCONSISTENT, FTAG, &ds);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- if (err) {
- dsl_dir_close(dd, FTAG);
- return (err);
- }
- dsl_dataset_make_exclusive(ds, FTAG);
- err = dsl_sync_task_do(dd->dd_pool,
- recv_full_existing_check,
- recv_full_existing_sync, ds, &rbsa, 5);
- dsl_dataset_disown(ds, FTAG);
- } else {
- err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
- recv_full_sync, dd, &rbsa, 5);
+ if (dmu_recv_verify_features(ds, drrb)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOTSUP);
}
- dsl_dir_close(dd, FTAG);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
+ dsl_dataset_rele(ds, FTAG);
if (err)
return (err);
drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
drc->drc_newfs = B_TRUE;
}
- return (0);
+ return (err);
}
struct restorearg {
@@ -704,10 +829,100 @@ struct restorearg {
uint64_t voff;
int bufsize; /* amount of memory allocated for buf */
zio_cksum_t cksum;
+ avl_tree_t *guid_to_ds_map;
};
+typedef struct guid_map_entry {
+ uint64_t guid;
+ dsl_dataset_t *gme_ds;
+ avl_node_t avlnode;
+} guid_map_entry_t;
+
static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
+guid_compare(const void *arg1, const void *arg2)
+{
+ const guid_map_entry_t *gmep1 = arg1;
+ const guid_map_entry_t *gmep2 = arg2;
+
+ if (gmep1->guid < gmep2->guid)
+ return (-1);
+ else if (gmep1->guid > gmep2->guid)
+ return (1);
+ return (0);
+}
+
+/*
+ * This function is a callback used by dmu_objset_find() (which
+ * enumerates the object sets) to build an avl tree that maps guids
+ * to datasets. The resulting table is used when processing DRR_WRITE_BYREF
+ * send stream records. These records, which are used in dedup'ed
+ * streams, do not contain data themselves, but refer to a copy
+ * of a data block that was written earlier in the stream.
+ * That previous copy is identified by the
+ * guid of the dataset with the referenced data.
+ */
+int
+find_ds_by_guid(const char *name, void *arg)
+{
+ avl_tree_t *guid_map = arg;
+ dsl_dataset_t *ds, *snapds;
+ guid_map_entry_t *gmep;
+ dsl_pool_t *dp;
+ int err;
+ uint64_t lastobj, firstobj;
+
+ if (dsl_dataset_hold(name, FTAG, &ds) != 0)
+ return (0);
+
+ dp = ds->ds_dir->dd_pool;
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
+ lastobj = ds->ds_phys->ds_prev_snap_obj;
+
+ while (lastobj != firstobj) {
+ err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
+ if (err) {
+ /*
+ * Skip this snapshot and move on. It's not
+ * clear why this would ever happen, but the
+			 * remainder of the snapshot stream can be
+ * processed.
+ */
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
+ gmep->guid = snapds->ds_phys->ds_guid;
+ gmep->gme_ds = snapds;
+ avl_add(guid_map, gmep);
+ lastobj = snapds->ds_phys->ds_prev_snap_obj;
+ }
+
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (0);
+}
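
[Editor's note: the receive path builds this guid-to-dataset map so DRR_WRITE_BYREF records can locate the dataset holding an earlier copy of the data. The kernel keys an AVL tree with guid_compare(); the stand-in below uses a sorted array with qsort/bsearch to show the same lookup shape. The entry type is illustrative; the real entries hold dsl_dataset_t references.]

    /*
     * Userspace stand-in for the guid map: sorted array + bsearch
     * instead of the kernel AVL tree.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    typedef struct gme {
            uint64_t guid;
            const char *dsname;  /* stands in for the dataset hold */
    } gme_t;

    static int
    gme_cmp(const void *a, const void *b)
    {
            const gme_t *g1 = a, *g2 = b;

            if (g1->guid < g2->guid)
                    return (-1);
            return (g1->guid > g2->guid);
    }

    int
    main(void)
    {
            gme_t map[] = {
                    { 0xdeadULL, "pool/fs@snap1" },
                    { 0xbeefULL, "pool/fs@snap2" },
            };
            gme_t key = { 0xbeefULL, NULL }, *hit;

            qsort(map, 2, sizeof (gme_t), gme_cmp);
            hit = bsearch(&key, map, 2, sizeof (gme_t), gme_cmp);
            printf("%s\n", hit ? hit->dsname : "not found");
            return (0);
    }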
+
+static void
+free_guid_map_onexit(void *arg)
+{
+ avl_tree_t *ca = arg;
+ void *cookie = NULL;
+ guid_map_entry_t *gmep;
+
+ while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+ dsl_dataset_rele(gmep->gme_ds, ca);
+ kmem_free(gmep, sizeof (guid_map_entry_t));
+ }
+ avl_destroy(ca);
+ kmem_free(ca, sizeof (avl_tree_t));
+}
+
+static int
+restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
struct uio auio;
struct iovec aiov;
@@ -742,7 +957,7 @@ restore_read(struct restorearg *ra, int len)
ASSERT3U(len % 8, ==, 0);
while (done < len) {
- int resid;
+ ssize_t resid;
ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
len - done, ra->voff, &resid);
@@ -774,7 +989,7 @@ backup_byteswap(dmu_replay_record_t *drr)
switch (drr->drr_type) {
case DRR_BEGIN:
DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_version);
+ DO64(drr_begin.drr_versioninfo);
DO64(drr_begin.drr_creation_time);
DO32(drr_begin.drr_type);
DO32(drr_begin.drr_flags);
@@ -788,27 +1003,56 @@ backup_byteswap(dmu_replay_record_t *drr)
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
DO32(drr_object.drr_bonuslen);
+ DO64(drr_object.drr_toguid);
break;
case DRR_FREEOBJECTS:
DO64(drr_freeobjects.drr_firstobj);
DO64(drr_freeobjects.drr_numobjs);
+ DO64(drr_freeobjects.drr_toguid);
break;
case DRR_WRITE:
DO64(drr_write.drr_object);
DO32(drr_write.drr_type);
DO64(drr_write.drr_offset);
DO64(drr_write.drr_length);
+ DO64(drr_write.drr_toguid);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+ DO64(drr_write.drr_key.ddk_prop);
+ break;
+ case DRR_WRITE_BYREF:
+ DO64(drr_write_byref.drr_object);
+ DO64(drr_write_byref.drr_offset);
+ DO64(drr_write_byref.drr_length);
+ DO64(drr_write_byref.drr_toguid);
+ DO64(drr_write_byref.drr_refguid);
+ DO64(drr_write_byref.drr_refobject);
+ DO64(drr_write_byref.drr_refoffset);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+ DO64(drr_write_byref.drr_key.ddk_prop);
break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
DO64(drr_free.drr_length);
+ DO64(drr_free.drr_toguid);
+ break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
break;
case DRR_END:
DO64(drr_end.drr_checksum.zc_word[0]);
DO64(drr_end.drr_checksum.zc_word[1]);
DO64(drr_end.drr_checksum.zc_word[2]);
DO64(drr_end.drr_checksum.zc_word[3]);
+ DO64(drr_end.drr_toguid);
break;
}
#undef DO64
@@ -825,7 +1069,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
if (drro->drr_type == DMU_OT_NONE ||
drro->drr_type >= DMU_OT_NUMTYPES ||
drro->drr_bonustype >= DMU_OT_NUMTYPES ||
- drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
@@ -864,8 +1108,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen);
}
- if (err)
+ if (err) {
return (EINVAL);
+ }
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, drro->drr_object);
@@ -875,7 +1120,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
return (err);
}
- dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+ dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
+ tx);
dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
if (data != NULL) {
@@ -957,6 +1203,114 @@ restore_write(struct restorearg *ra, objset_t *os,
return (0);
}
+/*
+ * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream. This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+restore_write_byref(struct restorearg *ra, objset_t *os,
+ struct drr_write_byref *drrwbr)
+{
+ dmu_tx_t *tx;
+ int err;
+ guid_map_entry_t gmesrch;
+ guid_map_entry_t *gmep;
+ avl_index_t where;
+ objset_t *ref_os = NULL;
+ dmu_buf_t *dbp;
+
+ if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+ return (EINVAL);
+
+ /*
+ * If the GUID of the referenced dataset is different from the
+ * GUID of the target dataset, find the referenced dataset.
+ */
+ if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+ gmesrch.guid = drrwbr->drr_refguid;
+ if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+ &where)) == NULL) {
+ return (EINVAL);
+ }
+ if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+ return (EINVAL);
+ } else {
+ ref_os = os;
+ }
+
+ if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+ return (err);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_write(os, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+ dmu_buf_rele(dbp, FTAG);
+ dmu_tx_commit(tx);
+ return (0);
+}
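
[Editor's note: a WRITE_BYREF record carries no payload; replay resolves drr_refguid to the objset that already holds the block and copies it into the target. A userspace model of that resolution follows, using flat buffers in place of objsets; names and sizes are invented, and the real code holds the referenced block via dmu_buf_hold() inside a transaction.]

    /*
     * Model of WRITE_BYREF replay: resolve the reference, then copy
     * the already-received block instead of reading stream data.
     */
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define BLK 8

    static char received[2][4 * BLK];  /* [0]=target, [1]=other dataset */

    static void
    model_write_byref(int same_guid, uint64_t refoff, uint64_t off)
    {
            /* refguid == toguid means the copy lives in the target */
            char *ref_os = same_guid ? received[0] : received[1];

            memcpy(received[0] + off, ref_os + refoff, BLK);
    }

    int
    main(void)
    {
            memcpy(received[1], "ABCDEFGH", BLK);  /* earlier write */
            model_write_byref(0, 0, BLK);          /* byref, other ds */
            printf("%.8s\n", received[0] + BLK);   /* ABCDEFGH */
            return (0);
    }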
+
+static int
+restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+{
+ dmu_tx_t *tx;
+ void *data;
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > SPA_MAXBLOCKSIZE)
+ return (EINVAL);
+
+ data = restore_read(ra, drrs->drr_length);
+ if (data == NULL)
+ return (ra->err);
+
+ if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_buf_will_dirty(db_spill, tx);
+
+ if (db_spill->db_size < drrs->drr_length)
+ VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ bcopy(data, db_spill->db_data, drrs->drr_length);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
@@ -976,37 +1330,18 @@ restore_free(struct restorearg *ra, objset_t *os,
return (err);
}
-void
-dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
-{
- if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
- /*
- * online incremental or new fs: destroy the fs (which
- * may be a clone) that we created
- */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
- if (drc->drc_real_ds != drc->drc_logical_ds)
- dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
- } else {
- /*
- * offline incremental: rollback to most recent snapshot.
- */
- (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
- dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
- }
-}
-
/*
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
-dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep)
{
- kthread_t *td = curthread;
struct restorearg ra = { 0 };
dmu_replay_record_t *drr;
objset_t *os;
zio_cksum_t pcksum;
+ int featureflags;
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
ra.byteswap = TRUE;
@@ -1031,30 +1366,69 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
if (ra.byteswap) {
struct drr_begin *drrb = drc->drc_drrb;
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
drrb->drr_type = BSWAP_32(drrb->drr_type);
drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
}
- ra.td = td;
+ ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
ra.bufsize = 1<<20;
ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
- ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
+ ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+ DMU_SUBSTREAM);
ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
/*
* Open the objset we are modifying.
*/
- VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
+ VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+ featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+ /* If this stream is dedup'ed, set up the AVL tree for guid mapping. */
+ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+ minor_t minor;
+
+ if (cleanup_fd == -1) {
+ ra.err = EBADF;
+ goto out;
+ }
+ ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (ra.err) {
+ cleanup_fd = -1;
+ goto out;
+ }
+
+ if (*action_handlep == 0) {
+ ra.guid_to_ds_map =
+ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(ra.guid_to_ds_map, guid_compare,
+ sizeof (guid_map_entry_t),
+ offsetof(guid_map_entry_t, avlnode));
+ (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
+ (void *)ra.guid_to_ds_map,
+ DS_FIND_CHILDREN);
+ ra.err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, ra.guid_to_ds_map,
+ action_handlep);
+ if (ra.err)
+ goto out;
+ } else {
+ ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&ra.guid_to_ds_map);
+ if (ra.err)
+ goto out;
+ }
+ }
+
/*
* Read records and process them.
*/
@@ -1094,6 +1468,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ra.err = restore_write(&ra, os, &drrw);
break;
}
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref drrwbr =
+ drr->drr_u.drr_write_byref;
+ ra.err = restore_write_byref(&ra, os, &drrwbr);
+ break;
+ }
case DRR_FREE:
{
struct drr_free drrf = drr->drr_u.drr_free;
@@ -1112,6 +1493,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ra.err = ECKSUM;
goto out;
}
+ case DRR_SPILL:
+ {
+ struct drr_spill drrs = drr->drr_u.drr_spill;
+ ra.err = restore_spill(&ra, os, &drrs);
+ break;
+ }
default:
ra.err = EINVAL;
goto out;
@@ -1121,15 +1508,22 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ASSERT(ra.err != 0);
out:
- dmu_objset_close(os);
+ if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+ zfs_onexit_fd_rele(cleanup_fd);
if (ra.err != 0) {
/*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
+ * destroy what we created, so we don't leave it in the
+ * inconsistent restoring state.
*/
txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
- dmu_recv_abort_cleanup(drc);
+
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ if (drc->drc_real_ds != drc->drc_logical_ds) {
+ mutex_exit(&drc->drc_logical_ds->ds_recvlock);
+ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+ }
}
kmem_free(ra.buf, ra.bufsize);
@@ -1153,12 +1547,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct recvendsyncarg *resa = arg2;
- dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+ dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
/* set snapshot's creation time and guid */
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
@@ -1170,35 +1564,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
struct recvendsyncarg resa;
dsl_dataset_t *ds = drc->drc_logical_ds;
int err;
/*
- * XXX hack; seems the ds is still dirty and
- * dsl_pool_zil_clean() expects it to have a ds_user_ptr
- * (and zil), but clone_swap() can close it.
+ * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+ * expects it to have a ds_user_ptr (and zil), but clone_swap()
+ * can close it.
*/
txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (ds != drc->drc_real_ds) {
- /* we are doing an online recv */
- if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
- err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
- drc->drc_force);
- if (err)
- dsl_dataset_disown(ds, dmu_recv_tag);
- } else {
- err = EBUSY;
- dsl_dataset_rele(ds, dmu_recv_tag);
- }
- /* dsl_dataset_destroy() will disown the ds */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+ err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+ drc->drc_force);
if (err)
- return (err);
+ goto out;
+ } else {
+ mutex_exit(&ds->ds_recvlock);
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ return (EBUSY);
}
resa.creation_time = drc->drc_drrb->drr_creation_time;
@@ -1208,16 +1598,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc)
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
recv_end_check, recv_end_sync, ds, &resa, 3);
if (err) {
- if (drc->drc_newfs) {
- ASSERT(ds == drc->drc_real_ds);
- (void) dsl_dataset_destroy(ds, dmu_recv_tag);
- return (err);
- } else {
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
- }
+ /* swap back */
+ (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
}
- /* release the hold from dmu_recv_begin */
+out:
+ mutex_exit(&ds->ds_recvlock);
dsl_dataset_disown(ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
return (err);
}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+ struct recvendsyncarg resa;
+ dsl_dataset_t *ds = drc->drc_logical_ds;
+ int err;
+
+ /*
+ * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+ * expects it to have a ds_user_ptr (and zil), but clone_swap()
+ * can close it.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+ resa.creation_time = drc->drc_drrb->drr_creation_time;
+ resa.toguid = drc->drc_drrb->drr_toguid;
+ resa.tosnap = drc->drc_tosnap;
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_end_check, recv_end_sync, ds, &resa, 3);
+ if (err) {
+ /* clean up the fs we just recv'd into */
+ (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
+ } else {
+ /* release the hold from dmu_recv_begin */
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ }
+ return (err);
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+ if (drc->drc_logical_ds != drc->drc_real_ds)
+ return (dmu_recv_existing_end(drc));
+ else
+ return (dmu_recv_new_end(drc));
+}
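A dedup'ed stream (DMU_BACKUP_FEATURE_DEDUP) makes the guid_to_ds_map built in dmu_recv_stream() load-bearing: each DRR_WRITE_BYREF record names the block it references only by the GUID of the snapshot that originally sent it, so restore_write_byref() must translate that GUID back into an open dataset. The avl_create() call above implies an entry type and ordering along these lines (a sketch; only guid_compare, guid_map_entry_t, and avlnode appear in the hunks, the other field names are illustrative):

typedef struct guid_map_entry {
	uint64_t	guid;		/* search key: snapshot guid */
	dsl_dataset_t	*gme_ds;	/* dataset to read the block from */
	avl_node_t	avlnode;	/* linkage named in avl_create() above */
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}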
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 89cbfad29f84..023f90e12e34 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -33,17 +32,13 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/callb.h>
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
+int zfs_pd_blks_max = 100;
-struct prefetch_data {
+typedef struct prefetch_data {
kmutex_t pd_mtx;
kcondvar_t pd_cv;
int pd_blks_max;
@@ -51,47 +46,46 @@ struct prefetch_data {
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
-};
+} prefetch_data_t;
-struct traverse_data {
+typedef struct traverse_data {
spa_t *td_spa;
uint64_t td_objset;
blkptr_t *td_rootbp;
uint64_t td_min_txg;
int td_flags;
- struct prefetch_data *td_pfd;
+ prefetch_data_t *td_pfd;
blkptr_cb_t *td_func;
void *td_arg;
-};
+} traverse_data_t;
-static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object);
-/* ARGSUSED */
-static void
+static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
- return;
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
- zb.zb_objset = td->td_objset;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
+
+ return (0);
}
-/* ARGSUSED */
-static void
+static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
@@ -99,28 +93,29 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
- zb.zb_objset = td->td_objset;
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
+ td->td_arg);
}
+ return (0);
}
static void
-traverse_zil(struct traverse_data *td, zil_header_t *zh)
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
zilog_t *zilog;
/*
* We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
+ * replayed; plus, in read-only mode, blocks that are already stable.
*/
if (claim_txg == 0 && spa_writeable(td->td_spa))
return;
@@ -134,16 +129,18 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
}
static int
-traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
- int err = 0;
+ int err = 0, lasterr = 0;
arc_buf_t *buf = NULL;
- struct prefetch_data *pd = td->td_pfd;
+ prefetch_data_t *pd = td->td_pfd;
+ boolean_t hard = td->td_flags & TRAVERSE_HARD;
if (bp->blk_birth == 0) {
- err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+ td->td_arg);
return (err);
}
@@ -163,7 +160,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
}
if (td->td_flags & TRAVERSE_PRE) {
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
if (err)
return (err);
}
@@ -174,7 +174,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -187,15 +187,18 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, buf, cbp, &czb);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -203,33 +206,43 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
/* recursively visitbp() blocks below this */
dnp = buf->b_data;
- for (i = 0; i < epb && err == 0; i++, dnp++) {
+ for (i = 0; i < epb; i++, dnp++) {
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
zb->zb_blkid * epb + i);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
objset_phys_t *osp;
dnode_phys_t *dnp;
- err = arc_read_nolock(NULL, td->td_spa, bp,
+ err = dsl_read_nolock(NULL, td->td_spa, bp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
osp = buf->b_data;
- traverse_zil(td, &osp->os_zil_header);
-
dnp = &osp->os_meta_dnode;
- err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ if (err && hard) {
+ lasterr = err;
+ err = 0;
+ }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
DMU_USERUSED_OBJECT);
}
+ if (err && hard) {
+ lasterr = err;
+ err = 0;
+ }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@@ -240,35 +253,54 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
- if (err == 0 && (td->td_flags & TRAVERSE_POST))
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
+ }
- return (err);
+ return (err != 0 ? err : lasterr);
}
static int
-traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object)
{
- int j, err = 0;
+ int j, err = 0, lasterr = 0;
zbookmark_t czb;
+ boolean_t hard = (td->td_flags & TRAVERSE_HARD);
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, buf,
(blkptr_t *)&dnp->dn_blkptr[j], &czb);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
- return (err);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset,
+ object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_spill, &czb);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ }
+ }
+ return (err != 0 ? err : lasterr);
}
/* ARGSUSED */
static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+ void *arg)
{
- struct prefetch_data *pfd = arg;
+ prefetch_data_t *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
ASSERT(pfd->pd_blks_fetched >= 0);
@@ -276,7 +308,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
return (EINTR);
if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
return (0);
mutex_enter(&pfd->pd_mtx);
@@ -286,7 +319,7 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
- (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+ (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, zb);
@@ -297,15 +330,16 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
static void
traverse_prefetch_thread(void *arg)
{
- struct traverse_data *td_main = arg;
- struct traverse_data td = *td_main;
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
zbookmark_t czb;
td.td_func = traverse_prefetcher;
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
- SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
mutex_enter(&td_main->td_pfd->pd_mtx);
@@ -319,16 +353,16 @@ traverse_prefetch_thread(void *arg)
* in syncing context).
*/
static int
-traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
- struct traverse_data td;
- struct prefetch_data pd = { 0 };
+ traverse_data_t td;
+ prefetch_data_t pd = { 0 };
zbookmark_t czb;
int err;
td.td_spa = spa;
- td.td_objset = objset;
+ td.td_objset = ds ? ds->ds_object : 0;
td.td_rootbp = rootbp;
td.td_min_txg = txg_start;
td.td_func = func;
@@ -336,17 +370,29 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
td.td_pfd = &pd;
td.td_flags = flags;
- pd.pd_blks_max = 100;
+ pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
+ objset_t *os;
+
+ err = dmu_objset_from_ds(ds, &os);
+ if (err)
+ return (err);
+
+ traverse_zil(&td, &os->os_zil_header);
+ }
+
if (!(flags & TRAVERSE_PREFETCH) ||
0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
&td, TQ_NOQUEUE))
pd.pd_exited = B_TRUE;
- SET_BOOKMARK(&czb, objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
mutex_enter(&pd.pd_mtx);
@@ -370,7 +416,7 @@ int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
- return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
&ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}
@@ -378,43 +424,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
* NB: pool must not be changing on-disk (eg, from zdb or sync context).
*/
int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
{
- int err;
+ int err, lasterr = 0;
uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
+ boolean_t hard = (flags & TRAVERSE_HARD);
/* visit the MOS */
- err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
- 0, TRAVERSE_PRE, func, arg);
+ err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
+ txg_start, flags, func, arg);
if (err)
return (err);
/* visit each dataset */
- for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+ for (obj = 1; err == 0 || (err != ESRCH && hard);
+ err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
- if (err)
- return (err);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ continue;
+ }
if (doi.doi_type == DMU_OT_DSL_DATASET) {
dsl_dataset_t *ds;
+ uint64_t txg = txg_start;
+
rw_enter(&dp->dp_config_rwlock, RW_READER);
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
rw_exit(&dp->dp_config_rwlock);
- if (err)
- return (err);
- err = traverse_dataset(ds,
- ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
- func, arg);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ continue;
+ }
+ if (ds->ds_phys->ds_prev_snap_txg > txg)
+ txg = ds->ds_phys->ds_prev_snap_txg;
+ err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ }
}
}
if (err == ESRCH)
err = 0;
- return (err);
+ return (err != 0 ? err : lasterr);
}
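The lasterr bookkeeping threaded through dmu_traverse.c implements TRAVERSE_HARD: rather than unwinding on the first failing child, the walk records the error, keeps visiting siblings, and reports the failure only at the end, so a single damaged subtree no longer aborts an entire pool traversal. Reduced to its essentials (visit, nchildren, and hard are stand-ins for the real callbacks and flags):

/* Returns 0 if every child succeeded, otherwise an error code. */
static int
visit_children(int (*visit)(int child), int nchildren, int hard)
{
	int err = 0, lasterr = 0;

	for (int i = 0; i < nchildren; i++) {
		err = visit(i);
		if (err != 0) {
			if (!hard)
				break;		/* default: fail fast */
			lasterr = err;		/* hard: remember and press on */
		}
	}
	return (err != 0 ? err : lasterr);
}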
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index b6a5cdbb89cd..81b8436707ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -33,7 +32,10 @@
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
+#include <sys/varargs.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
@@ -48,6 +50,8 @@ dmu_tx_create_dd(dsl_dir_t *dd)
tx->tx_pool = dd->dd_pool;
list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
offsetof(dmu_tx_hold_t, txh_node));
+ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
#ifdef ZFS_DEBUG
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
@@ -58,9 +62,9 @@ dmu_tx_create_dd(dsl_dir_t *dd)
dmu_tx_t *
dmu_tx_create(objset_t *os)
{
- dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
tx->tx_objset = os;
- tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
return (tx);
}
@@ -98,7 +102,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
int err;
if (object != DMU_NEW_OBJECT) {
- err = dnode_hold(os->os, object, tx, &dn);
+ err = dnode_hold(os, object, tx, &dn);
if (err) {
tx->tx_err = err;
return (NULL);
@@ -161,38 +165,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
}
static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
- boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+ int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
{
- int i = db->db_level + 1;
- dnode_t *dn = db->db_dnode;
-
- if (i >= dn->dn_nlevels)
+ objset_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent = NULL;
+ blkptr_t *bp = NULL;
+ uint64_t space;
+
+ if (level >= dn->dn_nlevels || history[level] == blkid)
return;
- db = db->db_parent;
- if (db == NULL) {
- uint64_t lvls = dn->dn_nlevels - i;
+ history[level] = blkid;
- txh->txh_space_towrite += lvls << dn->dn_indblkshift;
- return;
+ space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
+
+ if (db == NULL || db == dn->dn_dbuf) {
+ ASSERT(level != 0);
+ db = NULL;
+ } else {
+ ASSERT(DB_DNODE(db) == dn);
+ ASSERT(db->db_level == level);
+ ASSERT(db->db.db_size == space);
+ ASSERT(db->db_blkid == blkid);
+ bp = db->db_blkptr;
+ parent = db->db_parent;
}
- if (db != history[i]) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- uint64_t space = 1ULL << dn->dn_indblkshift;
+ freeable = (bp && (freeable ||
+ dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
- freeable = (db->db_blkptr && (freeable ||
- dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
- if (freeable)
- txh->txh_space_tooverwrite += space;
- else
- txh->txh_space_towrite += space;
- if (db->db_blkptr)
- txh->txh_space_tounref += space;
- history[i] = db;
- dmu_tx_count_indirects(txh, db, freeable, history);
- }
+ if (freeable)
+ txh->txh_space_tooverwrite += space;
+ else
+ txh->txh_space_towrite += space;
+ if (bp)
+ txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+ dmu_tx_count_twig(txh, dn, parent, level + 1,
+ blkid >> epbs, freeable, history);
}
/* ARGSUSED */
@@ -213,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
max_ibs = DN_MAX_INDBLKSHIFT;
if (dn) {
- dmu_buf_impl_t *last[DN_MAX_LEVELS];
+ uint64_t history[DN_MAX_LEVELS];
int nlvls = dn->dn_nlevels;
int delta;
@@ -221,7 +234,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* For i/o error checking, read the first and last level-0
* blocks (if they are not aligned), and all the level-1 blocks.
*/
-
if (dn->dn_maxblkid == 0) {
delta = dn->dn_datablksz;
start = (off < dn->dn_datablksz) ? 0 : 1;
@@ -247,7 +259,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
/* last level-0 block */
end = (off+len-1) >> dn->dn_datablkshift;
- if (end != start &&
+ if (end != start && end <= dn->dn_maxblkid &&
P2PHASE(off+len, dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(zio, dn, 0, end);
if (err)
@@ -290,29 +302,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* If this write is not off the end of the file
* we need to account for overwrites/unref.
*/
- if (start <= dn->dn_maxblkid)
- bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ if (start <= dn->dn_maxblkid) {
+ for (int l = 0; l < DN_MAX_LEVELS; l++)
+ history[l] = -1ULL;
+ }
while (start <= dn->dn_maxblkid) {
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, 0, start, FTAG);
+ err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
- if (db->db_blkptr && dsl_dataset_block_freeable(ds,
- db->db_blkptr->blk_birth)) {
- dprintf_bp(db->db_blkptr, "can free old%s", "");
- txh->txh_space_tooverwrite += dn->dn_datablksz;
- txh->txh_space_tounref += dn->dn_datablksz;
- dmu_tx_count_indirects(txh, db, TRUE, last);
- } else {
- txh->txh_space_towrite += dn->dn_datablksz;
- if (db->db_blkptr)
- txh->txh_space_tounref +=
- bp_get_dasize(spa, db->db_blkptr);
- dmu_tx_count_indirects(txh, db, FALSE, last);
+
+ if (err) {
+ txh->txh_tx->tx_err = err;
+ return;
}
+
+ dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+ history);
dbuf_rele(db, FTAG);
if (++start > end) {
/*
@@ -377,13 +384,13 @@ static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+ dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
uint64_t space = mdn->dn_datablksz +
((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
if (dn && dn->dn_dbuf->db_blkptr &&
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth)) {
+ dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
txh->txh_space_tooverwrite += space;
txh->txh_space_tounref += space;
} else {
@@ -428,7 +435,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* The struct_rwlock protects us against dn_nlevels
* changing, in case (against all odds) we manage to dirty &
* sync out the changes after we check for being dirty.
- * Also, dbuf_hold_level() wants us to have the struct_rwlock.
+ * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -458,9 +465,9 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkptr_t *bp = dn->dn_phys->dn_blkptr;
ASSERT3U(blkid + i, <, dn->dn_nblkptr);
bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+ if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
- space += bp_get_dasize(spa, bp);
+ space += bp_get_dsize(spa, bp);
}
unref += BP_GET_ASIZE(bp);
}
@@ -516,14 +523,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks);
- dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
-
- txh->txh_memory_tohold += dbuf->db.db_size;
- if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
- txh->txh_tx->tx_err = E2BIG;
- dbuf_rele(dbuf, FTAG);
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+ if (err) {
+ txh->txh_tx->tx_err = err;
break;
}
+
+ txh->txh_memory_tohold += dbuf->db.db_size;
+
+ /*
+ * We don't check memory_tohold against DMU_MAX_ACCESS because
+ * memory_tohold is an over-estimate (especially of the >L1
+ * indirect blocks), so the check could fail spuriously.
+ * Callers should have already verified that they will not be
+ * holding too much memory.
+ */
+
err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
if (err != 0) {
txh->txh_tx->tx_err = err;
@@ -535,9 +550,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
bp += blkoff;
for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+ if (dsl_dataset_block_freeable(ds, &bp[i],
+ bp[i].blk_birth)) {
dprintf_bp(&bp[i], "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
+ space += bp_get_dsize(spa, &bp[i]);
}
unref += BP_GET_ASIZE(bp);
}
@@ -582,6 +598,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
if (len != DMU_OBJECT_END)
dmu_tx_count_write(txh, off+len, 1);
+ dmu_tx_count_dnode(txh);
+
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
@@ -624,7 +642,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
}
}
- dmu_tx_count_dnode(txh);
dmu_tx_count_free(txh, off, len);
}
@@ -674,6 +691,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* the size will change between now and the dbuf dirty call.
*/
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ &dn->dn_phys->dn_blkptr[0],
dn->dn_phys->dn_blkptr[0].blk_birth)) {
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
} else {
@@ -689,7 +707,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* access the name in this fat-zap so that we'll check
* for i/o errors to the leaf blocks, etc.
*/
- err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ err = zap_lookup(dn->dn_objset, dn->dn_object, name,
8, 0, NULL);
if (err == EIO) {
tx->tx_err = err;
@@ -697,7 +715,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
}
}
- err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+ err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*
@@ -769,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
dmu_tx_hold_t *txh;
int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(tx->tx_txg != 0);
- ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
ASSERT3U(dn->dn_object, ==, db->db.db_object);
- if (tx->tx_anyobj)
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
return;
+ }
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
return;
+ }
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
@@ -809,10 +833,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
match_offset = TRUE;
/*
* We will let this hold work for the bonus
- * buffer so that we don't need to hold it
- * when creating a new object.
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
*/
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
match_offset = TRUE;
/*
* They might have to increase nlevels,
@@ -833,8 +858,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
case THT_BONUS:
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
match_offset = TRUE;
break;
case THT_ZAP:
@@ -847,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
ASSERT(!"bad txh_type");
}
}
- if (match_object && match_offset)
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
return;
+ }
}
+ DB_DNODE_EXIT(db);
panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
@@ -932,7 +964,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
* assume that we won't be able to free or overwrite anything.
*/
if (tx->tx_objset &&
- dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
tx->tx_lastsnap_txg) {
towrite += tooverwrite;
tooverwrite = tofree = 0;
@@ -1113,8 +1145,13 @@ dmu_tx_commit(dmu_tx_t *tx)
if (tx->tx_tempreserve_cookie)
dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+ if (!list_is_empty(&tx->tx_callbacks))
+ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
+
+ list_destroy(&tx->tx_callbacks);
list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
@@ -1143,6 +1180,14 @@ dmu_tx_abort(dmu_tx_t *tx)
if (dn != NULL)
dnode_rele(dn, tx);
}
+
+ /*
+ * Call any registered callbacks with an error code.
+ */
+ if (!list_is_empty(&tx->tx_callbacks))
+ dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+ list_destroy(&tx->tx_callbacks);
list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
refcount_destroy_many(&tx->tx_space_written,
@@ -1159,3 +1204,179 @@ dmu_tx_get_txg(dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
return (tx->tx_txg);
}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+ dmu_tx_callback_t *dcb;
+
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+ dcb->dcb_func = func;
+ dcb->dcb_data = data;
+
+ list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+ dmu_tx_callback_t *dcb;
+
+ while ((dcb = list_head(cb_list)) != NULL) {
+ list_remove(cb_list, dcb);
+ dcb->dcb_func(dcb->dcb_data, error);
+ kmem_free(dcb, sizeof (dmu_tx_callback_t));
+ }
+}
+
+/*
+ * Interface to hold a bunch of attributes, used when creating new
+ * files. attrsize is the total size of all attributes to be added
+ * during object creation.
+ *
+ * For updating or adding a single attribute, dmu_tx_hold_sa() should
+ * be used.
+ */
+
+/*
+ * Hold the attribute names needed for attribute registration.
+ * This should be a very rare case; if it does happen, it would only
+ * happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ int i;
+
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_hold_t *txh;
+ blkptr_t *bp;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+
+ dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ return;
+
+ /* If the blkptr doesn't exist, then add space to towrite. */
+ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref = 0;
+ } else {
+ bp = &dn->dn_phys->dn_spill;
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ bp, bp->blk_birth))
+ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ if (bp->blk_birth)
+ txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ }
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold an SA attribute.
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates that the attributes on this handle may grow, in
+ * which case space for the layout ZAP update and a spill block is
+ * also reserved below.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_tx_hold_bonus(tx, object);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ } else {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
+ }
+}
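dmu_tx_callback_register() and dmu_tx_do_callbacks() above give every transaction a commit-callback queue: each callback fires exactly once, with error 0 after the txg syncs or with ECANCELED from dmu_tx_abort(). The same drain-and-invoke shape in self-contained form (a plain singly linked list standing in for the kernel's list_t, purely illustrative):

#include <stdlib.h>

typedef void cb_func_t(void *data, int error);

typedef struct cb {
	struct cb	*cb_next;
	cb_func_t	*cb_func;
	void		*cb_data;
} cb_t;

/* Pop and invoke every queued callback with the given error code. */
static void
do_callbacks(cb_t **list, int error)
{
	cb_t *cb;

	while ((cb = *list) != NULL) {
		*list = cb->cb_next;
		cb->cb_func(cb->cb_data, error);
		free(cb);
	}
}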
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index acf628453571..b5ca66628f80 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -244,7 +244,7 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
break;
}
zs->zst_ph_offset = prefetch_tail;
- zs->zst_last = LBOLT;
+ zs->zst_last = ddi_get_lbolt();
}
void
@@ -405,6 +405,7 @@ top:
rc = 1;
goto out;
}
+
if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -432,6 +433,7 @@ top:
rc = 1;
goto out;
}
+
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -462,6 +464,7 @@ top:
rc = 1;
goto out;
}
+
if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
@@ -481,6 +484,7 @@ top:
rc = 1;
goto out;
}
+
if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
@@ -603,7 +607,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf)
for (zs = list_head(&zf->zf_stream); zs;
zs = list_next(&zf->zf_stream, zs)) {
- if (((LBOLT - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
break;
}
@@ -734,7 +738,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
newstream->zst_cap = zst.zst_len;
newstream->zst_direction = ZFETCH_FORWARD;
- newstream->zst_last = LBOLT;
+ newstream->zst_last = ddi_get_lbolt();
mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
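The dmu_zfetch.c hunks are mostly mechanical: the LBOLT macro becomes the ddi_get_lbolt() DDI call. The reap test itself is unchanged; a stream is stale once its idle time, converted from clock ticks to seconds by dividing by hz, exceeds zfetch_min_sec_reap. As a one-line predicate (the parameters are stand-ins for the globals used above):

/* True once the stream has been idle longer than min_sec_reap seconds. */
static int
stream_is_stale(long now_ticks, long last_ticks, long hz, long min_sec_reap)
{
	return (((now_ticks - last_ticks) / hz) > min_sec_reap);
}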
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index f9661d62d93e..b43035bb2e23 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -39,19 +38,35 @@
static int free_range_compar(const void *node1, const void *node2);
static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define DNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define DNODE_STAT_ADD(stat) /* nothing */
+#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+#ifdef sun
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
- int i;
dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
+ int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -60,8 +75,18 @@ dnode_cons(void *arg, void *unused, int kmflag)
refcount_create(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
+ list_link_init(&dn->dn_dirty_link[i]);
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
@@ -70,9 +95,28 @@ dnode_cons(void *arg, void *unused, int kmflag)
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
+ dn->dn_moved = 0;
+ POINTER_INVALIDATE(&dn->dn_objset);
return (0);
}
@@ -89,27 +133,56 @@ dnode_dest(void *arg, void *unused)
cv_destroy(&dn->dn_notxholds);
refcount_destroy(&dn->dn_holds);
refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
avl_destroy(&dn->dn_ranges[i]);
list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
}
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_free_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT3U(dn->dn_dirtyctx, ==, 0);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT3U(dn->dn_oldused, ==, 0);
+ ASSERT3U(dn->dn_oldflags, ==, 0);
+ ASSERT3U(dn->dn_olduid, ==, 0);
+ ASSERT3U(dn->dn_oldgid, ==, 0);
+ ASSERT3U(dn->dn_newuid, ==, 0);
+ ASSERT3U(dn->dn_newgid, ==, 0);
+ ASSERT3U(dn->dn_id_flags, ==, 0);
+
+ ASSERT3U(dn->dn_dbufs_count, ==, 0);
list_destroy(&dn->dn_dbufs);
}
void
dnode_init(void)
{
+ ASSERT(dnode_cache == NULL);
dnode_cache = kmem_cache_create("dnode_t",
sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move);
}
void
dnode_fini(void)
{
kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
}
@@ -121,6 +194,7 @@ dnode_verify(dnode_t *dn)
ASSERT(dn->dn_phys);
ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
@@ -210,6 +284,11 @@ dnode_byteswap(dnode_phys_t *dnp)
ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
}
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
}
void
@@ -258,6 +337,27 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ dnode_setdirty(dn, tx);
+ dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -272,18 +372,30 @@ dnode_setdblksz(dnode_t *dn, int size)
}
static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
- dn->dn_objset = os;
+ ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
dn->dn_object = object;
dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
dn->dn_phys = dnp;
- if (dnp->dn_datablkszsec)
+ if (dnp->dn_datablkszsec) {
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
dn->dn_indblkshift = dnp->dn_indblkshift;
dn->dn_nlevels = dnp->dn_nlevels;
dn->dn_type = dnp->dn_type;
@@ -293,49 +405,71 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+ dn->dn_id_flags = 0;
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
mutex_enter(&os->os_lock);
list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning dn_objset makes the
+ * dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
static void
dnode_destroy(dnode_t *dn)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
- ASSERT(dn->dn_oldphys == NULL);
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
list_remove(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
- if (dn->dn_dirtyctx_firstset) {
+ /* the dnode can no longer move, so we can release the handle */
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ if (dn->dn_dirtyctx_firstset != NULL) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
+ if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
}
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
}
@@ -367,6 +501,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT(ot != DMU_OT_NONE);
ASSERT3U(ot, <, DMU_OT_NUMTYPES);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
@@ -379,9 +514,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -392,7 +530,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else
+ dn->dn_nblkptr = 1 +
+ ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -406,10 +548,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
}
dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -425,13 +569,16 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
+ (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
+ dn->dn_id_flags = 0;
+
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_setdirty(dn, tx);
if (dn->dn_datablksz != blocksize) {
@@ -444,9 +591,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
}
if (dn->dn_bonuslen != bonuslen)
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ nblkptr = 1;
+ else
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
rw_exit(&dn->dn_struct_rwlock);
/* change type */
@@ -472,9 +629,306 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
+#ifdef DNODE_STATS
+static struct {
+ uint64_t dms_dnode_invalid;
+ uint64_t dms_dnode_recheck1;
+ uint64_t dms_dnode_recheck2;
+ uint64_t dms_dnode_special;
+ uint64_t dms_dnode_handle;
+ uint64_t dms_dnode_rwlock;
+ uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+ refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(list_is_empty(&ndn->dn_dbufs));
+ list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+ ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+ ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ odn->dn_zfetch.zf_dnode = NULL;
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_ranges[i].avl_root = NULL;
+ odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
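
The two invalidation comments above describe a pointer-tagging trick: a heap-allocated objset_t is at least word aligned, so its address always has the two low bits clear; setting bit 0 yields a value that can never be a live pointer, and kmem's debug fill patterns (0xbaddcafe, 0xdeadbeef) also leave a low bit set. A stand-alone C model of the POINTER_INVALIDATE()/POINTER_IS_VALID() pair (the macro bodies below are illustrative, not copied from the tree):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Low-bit tagging: a real pointer has both low bits clear. */
#define POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 1))
#define POINTER_IS_VALID(p)	(((uintptr_t)(p) & 3) == 0)

int
main(void)
{
	void *os = malloc(64);		/* stands in for dn_objset */
	void *save = os;

	printf("%d\n", POINTER_IS_VALID(os));	/* 1: aligned, low bits clear */
	POINTER_INVALIDATE(&os);
	printf("%d\n", POINTER_IS_VALID(os));	/* 0: low bit now set */
	free(save);
	return (0);
}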
+
+#ifdef sun
+#ifdef _KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = odn->dn_dbufs_count;
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == ndn->dn_dbufs_count);
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+#endif /* sun */
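
dnode_move() is a kmem cache move callback; on Solaris-derived kernels such callbacks are registered with kmem_cache_set_move() (the registration is outside this excerpt, so treat that as an assumption). The callback may answer only with the four kmem_cbrc_t verdicts used above. A compilable sketch of the decision ladder, with the locking, statistics, and back-pointer fixups elided and a stub type standing in for the dnode:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef enum {
	KMEM_CBRC_YES,		/* moved; old buffer may be reclaimed */
	KMEM_CBRC_NO,		/* never movable (cf. "special" dnodes) */
	KMEM_CBRC_LATER,	/* busy right now; kmem may retry */
	KMEM_CBRC_DONT_KNOW	/* cannot classify the buffer */
} kmem_cbrc_t;

struct obj {
	bool	known;		/* fully constructed and published */
	bool	pinned;		/* may never move */
	int	active_holds;	/* cf. refcount > dbufs above */
	char	payload[48];
};

static kmem_cbrc_t
move_cb(void *buf, void *newbuf, size_t size, void *arg)
{
	struct obj *o = buf, *n = newbuf;

	(void) arg;
	if (!o->known)			/* cf. !POINTER_IS_VALID(os) */
		return (KMEM_CBRC_DONT_KNOW);
	if (o->pinned)			/* cf. DMU_OBJECT_IS_SPECIAL() */
		return (KMEM_CBRC_NO);
	if (o->active_holds > 0)	/* cf. trylock failure / refcount > dbufs */
		return (KMEM_CBRC_LATER);
	memcpy(n, o, size);		/* cf. dnode_move_impl() */
	o->known = false;		/* cf. POINTER_INVALIDATE() */
	return (KMEM_CBRC_YES);
}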
+
void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
{
+ dnode_t *dn = dnh->dnh_dnode;
+
/*
* Wait for final references to the dnode to clear. This can
* only happen if the arc is asynchronously evicting state that
@@ -483,13 +937,19 @@ dnode_special_close(dnode_t *dn)
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+ dnh->dnh_dnode = dn;
+ zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
return (dn);
}
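
dnode_special_open() pairs zrl_init() with the zrl_destroy() in dnode_special_close(), and the comments above lean on two properties of the zrlock: holds (zrl_add()/zrl_remove()) stack freely, while an exclusive entry (zrl_tryenter()/zrl_exit()) succeeds only when there are no holders. The function names are the source's; the C11 body below is a guessed userland model, not the kernel implementation (which blocks on a condition variable rather than spinning):

#include <stdatomic.h>
#include <stdbool.h>

/* -1 encodes "exclusively locked"; >= 0 counts holders. */
typedef struct { atomic_int zr_refcount; } zrlock_t;

static void zrl_init(zrlock_t *z)	{ atomic_init(&z->zr_refcount, 0); }
static void zrl_destroy(zrlock_t *z)	{ (void) z; }

static void
zrl_add(zrlock_t *z)		/* take a hold; wait out an exclusive owner */
{
	for (;;) {
		int n = atomic_load(&z->zr_refcount);
		if (n >= 0 && atomic_compare_exchange_weak(&z->zr_refcount,
		    &n, n + 1))
			return;
	}
}

static void
zrl_remove(zrlock_t *z)
{
	atomic_fetch_sub(&z->zr_refcount, 1);
}

static bool
zrl_tryenter(zrlock_t *z)	/* exclusive, only if zero holders */
{
	int zero = 0;
	return (atomic_compare_exchange_strong(&z->zr_refcount, &zero, -1));
}

static void
zrl_exit(zrlock_t *z)		/* owner (holding -1) releases */
{
	atomic_store(&z->zr_refcount, 0);
}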
@@ -497,34 +957,43 @@ dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
static void
dnode_buf_pageout(dmu_buf_t *db, void *arg)
{
- dnode_t **children_dnodes = arg;
+ dnode_children_t *children_dnodes = arg;
int i;
int epb = db->db_size >> DNODE_SHIFT;
+ ASSERT(epb == children_dnodes->dnc_count);
+
for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
+ dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ dnode_t *dn;
- if (dn == NULL)
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (dnh->dnh_dnode == NULL) {
+ zrl_destroy(&dnh->dnh_zrlock);
continue;
-#ifdef ZFS_DEBUG
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligable for eviction and this function
+ * it wouldn't be eligible for eviction and this function
* would not have been called.
*/
ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
}
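
The sizeof (dnode_children_t) + (epb - 1) * sizeof (dnode_handle_t) expression is the pre-C99 "struct hack": dnc_children is declared with one element and the allocation is padded out to epb of them, so the matching free must repeat the same size arithmetic. A generic model (vec_t and its helpers are made-up names):

#include <stdlib.h>

typedef struct {
	int	count;
	int	slot[1];	/* really `count' entries follow */
} vec_t;

static vec_t *
vec_alloc(int n)
{
	/* over-allocate for the n - 1 elements beyond slot[0] */
	vec_t *v = malloc(sizeof (vec_t) + (n - 1) * sizeof (int));

	if (v != NULL)
		v->count = n;
	return (v);
}

static void
vec_free(vec_t *v)
{
	/* kmem_free() requires the same size expression used at alloc time */
	free(v);
}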
/*
@@ -534,7 +1003,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag,
void *tag, dnode_t **dnp)
{
int epb, idx, err;
@@ -543,17 +1012,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
+ dnode_children_t *children_dnodes;
+ dnode_handle_t *dnh;
/*
* If you are holding the spa config lock as writer, you shouldn't
- * be asking the DMU to do *anything*.
+ * be asking the DMU to do *anything* unless it's the root pool
+ * which may require us to read from the root filesystem while
+ * holding some (not all) of the locks as writer.
*/
- ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+ (spa_is_root(os->os_spa) &&
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
- os->os_userused_dnode : os->os_groupused_dnode;
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
return (ENOENT);
type = dn->dn_type;
@@ -570,7 +1044,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
- mdn = os->os_meta_dnode;
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
DNODE_VERIFY(mdn);
@@ -597,26 +1072,39 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
idx = object & (epb-1);
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
+ int i;
+ dnode_children_t *winner;
+ children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ children_dnodes->dnc_count = epb;
+ dnh = &children_dnodes->dnc_children[0];
+ for (i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+ dnh[i].dnh_dnode = NULL;
+ }
if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
+ ASSERT(children_dnodes->dnc_count == epb);
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ if ((dn = dnh->dnh_dnode) == NULL) {
+ dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, dnp, db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ dn = dnode_create(os, phys, db, object, dnh);
+ winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
if (winner != NULL) {
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
dn = winner;
}
}
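
The NULL-check / create / atomic_cas_ptr sequence above is the lock-free publish idiom: every racer builds its own candidate, exactly one CAS wins, and the losers destroy their copy and adopt the winner's. A userland C11 model (get_or_create() and node_t are hypothetical names):

#include <stdatomic.h>
#include <stdlib.h>

typedef struct node { int payload; } node_t;

static node_t *
get_or_create(_Atomic(node_t *) *slot)
{
	node_t *n, *expected = NULL;

	if ((n = atomic_load(slot)) != NULL)
		return (n);
	n = calloc(1, sizeof (*n));		/* cf. dnode_create() */
	if (n == NULL)
		return (NULL);
	if (!atomic_compare_exchange_strong(slot, &expected, n)) {
		free(n);			/* loser: cf. dnode_destroy() */
		n = expected;			/* adopt the winner's object */
	}
	return (n);
}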
@@ -626,15 +1114,18 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || dn->dn_oldphys))) {
+ (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
+ zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
+ dbuf_add_ref(db, dnh);
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
@@ -649,7 +1140,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
* Return held dnode if the object is allocated, NULL if not.
*/
int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
@@ -676,19 +1167,43 @@ void
dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ /* Get while the hold prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ dbuf_rele(db, dnh);
+ }
}
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
@@ -701,10 +1216,15 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
#ifdef ZFS_DEBUG
mutex_enter(&dn->dn_mtx);
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
mutex_exit(&dn->dn_mtx);
#endif
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
mutex_enter(&os->os_lock);
/*
@@ -719,6 +1239,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_datablksz != 0);
ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
@@ -734,7 +1255,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
- * dbuf maintaines a hold on the dnode. When the last child
+ * dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
@@ -813,7 +1334,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -857,7 +1379,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
int epbs, new_nlevels;
uint64_t sz;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(have_read ?
RW_READ_HELD(&dn->dn_struct_rwlock) :
@@ -904,6 +1426,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
@@ -914,7 +1437,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
@@ -1169,6 +1693,20 @@ out:
rw_exit(&dn->dn_struct_rwlock);
}
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
@@ -1177,7 +1715,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
return (FALSE);
/*
@@ -1190,6 +1728,9 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
if (dn->dn_free_txg)
return (TRUE);
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
@@ -1247,7 +1788,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
if (space > 0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 3bf0c81d0992..32afe7d74735 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
if (child == NULL)
continue;
- ASSERT3P(child->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
if (child->db_parent && child->db_parent != dn->dn_dbuf) {
ASSERT(child->db_parent->db_level == db->db_level);
ASSERT(child->db_blkptr !=
@@ -120,7 +123,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
bzero(bp, sizeof (blkptr_t));
blocks_freed += 1;
@@ -136,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
int off, num;
int i, err, epbs;
uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
off = start - (db->db_blkid * 1<<epbs);
num = end - start + 1;
ASSERT3U(off, >=, 0);
ASSERT3U(num, >=, 0);
ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
ASSERT(db->db_blkptr != NULL);
@@ -156,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
ASSERT(err == 0);
@@ -201,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
dbuf_rele(child, FTAG);
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -210,7 +217,7 @@ static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
@@ -228,10 +235,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
+ dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
shift = (db->db_level - 1) * epbs;
dbstart = db->db_blkid << epbs;
start = blkid >> shift;
@@ -254,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
@@ -273,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
}
dbuf_rele(subdb, FTAG);
}
+ DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -376,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
- ASSERT3P(db->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
@@ -424,6 +439,9 @@ dnode_undirty_dbufs(list_t *list)
dmu_buf_impl_t *db = dr->dr_dbuf;
uint64_t txg = dr->dr_txg;
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
mutex_enter(&db->db_mtx);
/* XXX - use dbuf_undirty()? */
list_remove(list, dr);
@@ -431,18 +449,15 @@ dnode_undirty_dbufs(list_t *list)
db->db_last_dirty = NULL;
db->db_dirtycnt -= 1;
if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_data == db->db_buf);
dbuf_unoverride(dr);
- mutex_exit(&db->db_mtx);
} else {
- mutex_exit(&db->db_mtx);
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}
}
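
Note the restructuring above: the recursion into dr_children now happens before db_mtx is taken, and the final release goes through dbuf_rele_and_unlock(), which consumes a mutex the caller already holds instead of the old exit-then-rele sequence. One plausible shape for such a helper, sketched with a pthread mutex (the real dbuf teardown is considerably more involved):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t	mtx;
	int		refs;
};

static void
obj_rele_and_unlock(struct obj *o)
{
	int refs;

	/* caller holds o->mtx, as dnode_undirty_dbufs() holds db_mtx */
	refs = --o->refs;
	pthread_mutex_unlock(&o->mtx);
	if (refs == 0) {
		/* last reference: no one else can reach o any more */
		pthread_mutex_destroy(&o->mtx);
		free(o);	/* stand-in for eviction/teardown */
	}
}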
@@ -493,6 +508,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -515,6 +531,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
static const dnode_phys_t zerodn = { 0 };
+ boolean_t kill_spill = B_FALSE;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -526,10 +543,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
if (dmu_objset_userused_enabled(dn->dn_objset) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- ASSERT(dn->dn_oldphys == NULL);
- dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
- *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
} else {
/* Once we account for it, we should always account for it. */
ASSERT(!(dn->dn_phys->dn_flags &
@@ -560,6 +579,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+ avl_last(&dn->dn_ranges[txgoff]) ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -576,6 +596,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_bonuslen[txgoff] = 0;
}
+ if (dn->dn_next_bonustype[txgoff]) {
+ ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ /*
+ * We will either remove a spill block when a file is being removed
+ * or we have been asked to remove it.
+ */
+ if (dn->dn_rm_spillblk[txgoff] ||
+ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
+ if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -592,6 +630,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
+ if (kill_spill) {
+ (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
/* process all the "freed" ranges in the file */
while (rp = avl_last(&dn->dn_ranges[txgoff])) {
dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
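
Nearly every hunk in dnode_sync.c swaps a raw db->db_dnode load for the DB_DNODE_ENTER()/DB_DNODE()/DB_DNODE_EXIT() triple: once dnodes are movable, a bare cached pointer can go stale, so the accessor must pin the handle for the duration of the access. In the real tree the pin is the handle's dnh_zrlock; the model below substitutes a plain hold counter:

#include <stdatomic.h>

typedef struct dnode_handle {
	void		*dnh_dnode;
	atomic_int	dnh_holds;	/* stand-in for dnh_zrlock */
} dnode_handle_t;

typedef struct dmu_buf_impl {
	dnode_handle_t	*db_dnode_handle;
} dmu_buf_impl_t;

/* Pin the handle so the dnode cannot move, then read the pointer. */
#define DB_DNODE_ENTER(db) \
	atomic_fetch_add(&(db)->db_dnode_handle->dnh_holds, 1)
#define DB_DNODE(db)	((db)->db_dnode_handle->dnh_dnode)
#define DB_DNODE_EXIT(db) \
	atomic_fetch_sub(&(db)->db_dnode_handle->dnh_holds, 1)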
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index ac9d67f671f6..19b663e3ec0a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -38,16 +37,24 @@
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
-#include <sys/sunddi.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
static char *dsl_reaper = "the grim reaper";
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
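
SWITCH64 expands to a bare brace block rather than the usual do { ... } while (0), so it must be used as a standalone statement. A trivial usage sketch; the accounting-swap context is an assumption, since SWITCH64's call sites are outside this excerpt:

#include <stdint.h>
#include <stdio.h>

#define SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

int
main(void)
{
	uint64_t used_a = 100, used_b = 250;

	SWITCH64(used_a, used_b);	/* e.g. swapping accounting fields */
	printf("%llu %llu\n", (unsigned long long)used_a,
	    (unsigned long long)used_b);	/* prints: 250 100 */
	return (0);
}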
@@ -76,14 +83,14 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
}
void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
- dprintf_bp(bp, "born, ds=%p\n", ds);
+ dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
/* It could have been compressed away to nothing */
@@ -103,6 +110,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
return;
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
@@ -119,29 +127,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
}
int
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- ASSERT(pio != NULL);
- ASSERT(dmu_tx_is_syncing(tx));
- /* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
return (0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
ASSERT(used > 0);
if (ds == NULL) {
- int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-used, -compressed, -uncompressed, tx);
@@ -154,13 +159,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
- int err;
int64_t delta;
- dprintf_bp(bp, "freeing: %s", "");
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
@@ -176,7 +178,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
mutex_exit(&ds->ds_dir->dd_lock);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call dsl_deadlist_insert() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_append(&ds->ds_pending_deadlist, bp);
+ } else {
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+ }
ASSERT3U(ds->ds_prev->ds_object, ==,
ds->ds_phys->ds_prev_snap_obj);
ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
@@ -189,7 +202,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
ds->ds_prev->ds_phys->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
- if (bp->blk_birth > ds->ds_origin_txg) {
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -230,9 +243,15 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
}
boolean_t
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+ uint64_t blk_birth)
{
- return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+ if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
+ return (B_FALSE);
+
+ ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+
+ return (B_TRUE);
}
/* ARGSUSED */
@@ -243,19 +262,23 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
- dprintf_ds(ds, "evicting %s\n", "");
-
unique_remove(ds->ds_fsid_guid);
- if (ds->ds_user_ptr != NULL)
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ if (ds->ds_objset != NULL)
+ dmu_objset_evict(ds->ds_objset);
if (ds->ds_prev) {
dsl_dataset_drop_ref(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (db != NULL) {
+ dsl_deadlist_close(&ds->ds_deadlist);
+ } else {
+ ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
+ ASSERT(!ds->ds_deadlist.dl_oldfmt);
+ }
if (ds->ds_dir)
dsl_dir_close(ds->ds_dir, ds);
@@ -264,12 +287,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
if (mutex_owned(&ds->ds_lock))
mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
if (mutex_owned(&ds->ds_opening_lock))
mutex_exit(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_opening_lock);
- if (mutex_owned(&ds->ds_deadlist.bpl_lock))
- mutex_exit(&ds->ds_deadlist.bpl_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
@@ -329,6 +350,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
matchtype_t mt;
int err;
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_FIRST;
else
@@ -348,6 +371,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
int err;
+ dmu_object_info_t doi;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
@@ -355,6 +379,12 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
if (err)
return (err);
+
+ /* Make sure dsobj has the correct object type. */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_type != DMU_OT_DSL_DATASET)
+ return (EINVAL);
+
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner;
@@ -365,28 +395,27 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
ds->ds_phys = dbuf->db_data;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
- NULL);
rw_init(&ds->ds_rwlock, 0, 0, 0);
cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
- err = bplist_open(&ds->ds_deadlist,
+ bplist_create(&ds->ds_pending_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
+
if (err == 0) {
err = dsl_dir_open_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
}
if (err) {
- /*
- * we don't really need to close the blist if we
- * just opened it.
- */
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
@@ -399,21 +428,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
ds->ds_phys->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
-
- if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
- dsl_dataset_t *origin;
-
- err = dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj,
- FTAG, &origin);
- if (err == 0) {
- ds->ds_origin_txg =
- origin->ds_phys->ds_creation_txg;
- dsl_dataset_rele(origin, FTAG);
- }
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj,
+ &ds->ds_userrefs);
}
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
}
if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
@@ -449,13 +472,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_evict);
}
if (err || winner) {
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
if (ds->ds_prev)
dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
@@ -551,17 +575,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
}
int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
- dsl_dataset_t **dsp)
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
-
- ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
-
+ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
if (err)
return (err);
- if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
- dsl_dataset_rele(*dsp, owner);
+ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+ dsl_dataset_rele(*dsp, tag);
*dsp = NULL;
return (EBUSY);
}
@@ -628,18 +649,14 @@ out:
}
int
-dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
+dsl_dataset_own(const char *name, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold(name, owner, dsp);
+ int err = dsl_dataset_hold(name, tag, dsp);
if (err)
return (err);
- if ((*dsp)->ds_phys->ds_num_children > 0 &&
- !DS_MODE_IS_READONLY(flags)) {
- dsl_dataset_rele(*dsp, owner);
- return (EROFS);
- }
- if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
- dsl_dataset_rele(*dsp, owner);
+ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+ dsl_dataset_rele(*dsp, tag);
return (EBUSY);
}
return (0);
@@ -711,9 +728,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
}
void
-dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
+dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
- ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
+ ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
(DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
mutex_enter(&ds->ds_lock);
@@ -724,20 +741,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
}
mutex_exit(&ds->ds_lock);
if (ds->ds_dbuf)
- dsl_dataset_drop_ref(ds, owner);
+ dsl_dataset_drop_ref(ds, tag);
else
- dsl_dataset_evict(ds->ds_dbuf, ds);
+ dsl_dataset_evict(NULL, ds);
}
boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
+dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
boolean_t gotit = FALSE;
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL &&
(!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
- ds->ds_owner = owner;
+ ds->ds_owner = tag;
if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
rw_exit(&ds->ds_rwlock);
gotit = TRUE;
@@ -788,10 +805,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (origin) {
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds;
+
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
origin->ds_phys->ds_creation_txg;
@@ -807,6 +826,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
if (origin->ds_phys->ds_next_clones_obj == 0) {
origin->ds_phys->ds_next_clones_obj =
@@ -820,6 +845,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(mos,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+ }
}
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -852,6 +887,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dir_close(dd, FTAG);
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's zil header.
+ */
+ if (origin != NULL) {
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ dsl_dataset_dirty(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
return (dsobj);
}
@@ -859,30 +909,29 @@ struct destroyarg {
dsl_sync_task_group_t *dstg;
char *snapname;
char *failed;
+ boolean_t defer;
};
static int
-dsl_snapshot_destroy_one(char *name, void *arg)
+dsl_snapshot_destroy_one(const char *name, void *arg)
{
struct destroyarg *da = arg;
dsl_dataset_t *ds;
- char *cp;
int err;
+ char *dsname;
- (void) strcat(name, "@");
- (void) strcat(name, da->snapname);
- err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- da->dstg, &ds);
- cp = strchr(name, '@');
- *cp = '\0';
+ dsname = kmem_asprintf("%s@%s", name, da->snapname);
+ err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
+ strfree(dsname);
if (err == 0) {
+ struct dsl_ds_destroyarg *dsda;
+
dsl_dataset_make_exclusive(ds, da->dstg);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
+ dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
+ dsda->ds = ds;
+ dsda->defer = da->defer;
dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
+ dsl_dataset_destroy_sync, dsda, da->dstg, 0);
} else if (err == ENOENT) {
err = 0;
} else {
@@ -896,7 +945,7 @@ dsl_snapshot_destroy_one(char *name, void *arg)
*/
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
-dsl_snapshots_destroy(char *fsname, char *snapname)
+dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
{
int err;
struct destroyarg da;
@@ -909,6 +958,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
da.snapname = snapname;
da.failed = fsname;
+ da.defer = defer;
err = dmu_objset_find(fsname,
dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
@@ -918,7 +968,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
for (dst = list_head(&da.dstg->dstg_tasks); dst;
dst = list_next(&da.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
+ struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
+ dsl_dataset_t *ds = dsda->ds;
+
/*
* Return the file system name that triggered the error
*/
@@ -926,7 +978,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
dsl_dataset_name(ds, fsname);
*strchr(fsname, '@') = '\0';
}
+ ASSERT3P(dsda->rm_origin, ==, NULL);
dsl_dataset_disown(ds, da.dstg);
+ kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
}
dsl_sync_task_group_destroy(da.dstg);
@@ -934,34 +988,94 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
return (err);
}
+static boolean_t
+dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
+{
+ boolean_t might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds))
+ might_destroy = B_TRUE;
+ mutex_exit(&ds->ds_lock);
+
+ return (might_destroy);
+}
+
+/*
+ * If we're removing a clone, and these three conditions are true:
+ * 1) the clone's origin has no other children
+ * 2) the clone's origin has no user references
+ * 3) the clone's origin has been marked for deferred destruction
+ * Then, prepare to remove the origin as part of this sync task group.
+ */
+static int
+dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *origin = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(origin)) {
+ char *name;
+ int namelen;
+ int error;
+
+ namelen = dsl_dataset_namelen(origin) + 1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(origin, name);
+#ifdef _KERNEL
+ error = zfs_unmount_snap(name, NULL);
+ if (error) {
+ kmem_free(name, namelen);
+ return (error);
+ }
+#endif
+ error = dsl_dataset_own(name, B_TRUE, tag, &origin);
+ kmem_free(name, namelen);
+ if (error)
+ return (error);
+ dsda->rm_origin = origin;
+ dsl_dataset_make_exclusive(origin, tag);
+ }
+
+ return (0);
+}
+
/*
* ds must be opened as OWNER. On return (whether successful or not),
* ds will be closed and caller can no longer dereference it.
*/
int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
int err;
dsl_sync_task_group_t *dstg;
objset_t *os;
dsl_dir_t *dd;
uint64_t obj;
+ struct dsl_ds_destroyarg dsda = { 0 };
+ dsl_dataset_t dummy_ds = { 0 };
+
+ dsda.ds = ds;
if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
+ dsda.defer = defer;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, tag, 0);
+ &dsda, tag, 0);
+ ASSERT3P(dsda.rm_origin, ==, NULL);
+ goto out;
+ } else if (defer) {
+ err = EINVAL;
goto out;
}
dd = ds->ds_dir;
+ dummy_ds.ds_dir = dd;
+ dummy_ds.ds_object = ds->ds_object;
/*
* Check for errors and mark this ds as inconsistent, in
@@ -972,7 +1086,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
if (err)
goto out;
- err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ err = dmu_objset_from_ds(ds, &os);
if (err)
goto out;
@@ -988,11 +1102,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
*/
(void) dmu_free_object(os, obj);
}
+ if (err != ESRCH)
+ goto out;
+
+ /*
+ * Only the ZIL knows how to free log blocks.
+ */
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
/*
- * We need to sync out all in-flight IO before we try to evict
- * (the dataset evict func is trying to clear the cached entries
- * for this dataset in the ARC).
+ * Sync out all in-flight IO.
*/
txg_wait_synced(dd->dd_pool, 0);
@@ -1001,7 +1120,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
* context, the user space accounting should be zero.
*/
if (ds->ds_phys->ds_bp.blk_fill == 0 &&
- dmu_objset_userused_enabled(os->os)) {
+ dmu_objset_userused_enabled(os)) {
uint64_t count;
ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
@@ -1010,10 +1129,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
count == 0);
}
- dmu_objset_close(os);
- if (err != ESRCH)
- goto out;
-
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
rw_exit(&dd->dd_pool->dp_config_rwlock);
@@ -1021,30 +1136,48 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
if (err)
goto out;
- if (ds->ds_user_ptr) {
- /*
- * We need to sync out all in-flight IO before we try
- * to evict (the dataset evict func is trying to clear
- * the cached entries for this dataset in the ARC).
- */
- txg_wait_synced(dd->dd_pool, 0);
- }
-
/*
* Blow away the dsl_dir + head dataset.
*/
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
- dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, tag, 0);
- dsl_sync_task_create(dstg, dsl_dir_destroy_check,
- dsl_dir_destroy_sync, dd, FTAG, 0);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
+ /*
+ * If we're removing a clone, we might also need to remove its
+ * origin.
+ */
+ do {
+ dsda.need_prep = B_FALSE;
+ if (dsl_dir_is_clone(dd)) {
+ err = dsl_dataset_origin_rm_prep(&dsda, tag);
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ goto out;
+ }
+ }
+
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, &dsda, tag, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+
+ /*
+ * We could be racing against 'zfs release' or 'zfs destroy -d'
+ * on the origin snap, in which case we can get EBUSY if we
+ * needed to destroy the origin snap but were not ready to
+ * do so.
+ */
+ if (dsda.need_prep) {
+ ASSERT(err == EBUSY);
+ ASSERT(dsl_dir_is_clone(dd));
+ ASSERT(dsda.rm_origin == NULL);
+ }
+ } while (dsda.need_prep);
+
+ if (dsda.rm_origin != NULL)
+ dsl_dataset_disown(dsda.rm_origin, tag);
+
/* if it is successful, dsl_dir_destroy_sync will close the dd */
if (err)
dsl_dir_close(dd, FTAG);
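
The do/while above reruns the whole sync task group when dsl_dataset_origin_check() discovers, inside the sync task, that the origin snapshot must be destroyed too: the origin can only be owned from open context, so the check fails with EBUSY, sets need_prep, and the loop retries after dsl_dataset_origin_rm_prep(). A minimal model of that handshake (run_pass() is a made-up stand-in for the task group):

#include <errno.h>
#include <stdbool.h>

static int
run_pass(bool *need_prep)
{
	static int calls;

	/* First pass: sync context discovers the origin must go too. */
	if (calls++ == 0) {
		*need_prep = true;	/* cf. dsda->need_prep = B_TRUE */
		return (EBUSY);
	}
	return (0);			/* origin prepped; destroy proceeds */
}

static int
destroy_with_origin(void)
{
	bool need_prep;
	int err;

	do {
		need_prep = false;	/* cf. dsda.need_prep = B_FALSE */
		err = run_pass(&need_prep);
	} while (need_prep);		/* retry until prep is settled */
	return (err);
}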
@@ -1053,47 +1186,6 @@ out:
return (err);
}
-int
-dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
-{
- int err;
-
- ASSERT(ds->ds_owner);
-
- dsl_dataset_make_exclusive(ds, ds->ds_owner);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, &ost, 0);
- /* drop exclusive access */
- mutex_enter(&ds->ds_lock);
- rw_exit(&ds->ds_rwlock);
- cv_broadcast(&ds->ds_exclusive_cv);
- mutex_exit(&ds->ds_lock);
- return (err);
-}
-
-void *
-dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func)
-{
- void *old;
-
- mutex_enter(&ds->ds_lock);
- old = ds->ds_user_ptr;
- if (old == NULL) {
- ds->ds_user_ptr = p;
- ds->ds_user_evict_func = func;
- }
- mutex_exit(&ds->ds_lock);
- return (old);
-}
-
-void *
-dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
-{
- return (ds->ds_user_ptr);
-}
-
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
@@ -1127,7 +1219,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
if (ds == NULL) /* this is the meta-objset */
return;
- ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_objset != NULL);
if (ds->ds_phys->ds_next_snap_obj != 0)
panic("dirtying snapshot!");
@@ -1154,62 +1246,51 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
uint64_t mrs_used;
uint64_t dlused, dlcomp, dluncomp;
- ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+ ASSERT(!dsl_dataset_is_snapshot(ds));
if (ds->ds_phys->ds_prev_snap_obj != 0)
mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
else
mrs_used = 0;
- VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
- &dluncomp));
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
ASSERT3U(dlused, <=, mrs_used);
ds->ds_phys->ds_unique_bytes =
ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
- if (!DS_UNIQUE_IS_ACCURATE(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
-{
- if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
- dsl_dataset_recalc_head_uniq(ds);
-
- return (ds->ds_phys->ds_unique_bytes);
-}
-
struct killarg {
dsl_dataset_t *ds;
- zio_t *zio;
dmu_tx_t *tx;
};
/* ARGSUSED */
static int
-kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
if (bp == NULL)
return (0);
- if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
- (zb->zb_object != 0 && dnp == NULL)) {
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
/*
* It's a block in the intent log. It has no
* accounting, so just free it.
*/
- VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
- ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
+ ASSERT(zilog == NULL);
ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
- (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
return (0);
@@ -1217,143 +1298,6 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
/* ARGSUSED */
static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- dmu_objset_type_t *ost = arg2;
-
- /*
- * We can only roll back to emptyness if it is a ZPL objset.
- */
- if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
- return (EINVAL);
-
- /*
- * This must not be a snapshot.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- dmu_objset_type_t *ost = arg2;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- if (ds->ds_user_ptr != NULL) {
- /*
- * We need to make sure that the objset_impl_t is reopened after
- * we do the rollback, otherwise it will have the wrong
- * objset_phys_t. Normally this would happen when this
- * dataset-open is closed, thus causing the
- * dataset to be immediately evicted. But when doing "zfs recv
- * -F", we reopen the objset before that, so that there is no
- * window where the dataset is closed and inconsistent.
- */
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
-
- /* Transfer space that was freed since last snap back to the head. */
- {
- uint64_t used;
-
- VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
- ds->ds_origin_txg, UINT64_MAX, &used));
- dsl_dir_transfer_space(ds->ds_dir, used,
- DD_USED_SNAP, DD_USED_HEAD, tx);
- }
-
- /* Zero out the deadlist. */
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- {
- /*
- * Free blkptrs that we gave birth to - this covers
- * claimed but not played log blocks too.
- */
- zio_t *zio;
- struct killarg ka;
-
- zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED);
- ka.ds = ds;
- ka.zio = zio;
- ka.tx = tx;
- (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- TRAVERSE_POST, kill_blkptr, &ka);
- (void) zio_wait(zio);
- }
-
- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
- if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
- /* Change our contents to that of the prev snapshot */
-
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
- ds->ds_prev->ds_phys->ds_used_bytes);
-
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes =
- ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
-
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
- }
- } else {
- objset_impl_t *osi;
-
- ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
- ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
- ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
-
- bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
- ds->ds_phys->ds_flags = 0;
- ds->ds_phys->ds_unique_bytes = 0;
- if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
- SPA_VERSION_UNIQUE_ACCURATE)
- ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
- osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
- &ds->ds_phys->ds_bp, *ost, tx);
-#ifdef _KERNEL
- zfs_create_fs(&osi->os, kcred, NULL, tx);
-#endif
- }
-
- spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", ds->ds_object);
-}
-
-/* ARGSUSED */
-static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
@@ -1368,7 +1312,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
if (ds->ds_prev != NULL &&
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
+ return (EBUSY);
/*
* This is really a dsl_dir thing, but check it here so that
@@ -1386,7 +1330,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1395,22 +1339,72 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
- spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
}
+static int
+dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
+ dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *ds_prev = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(ds_prev)) {
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ /*
+ * If we're not prepared to remove the origin, don't remove
+ * the clone either.
+ */
+ if (dsda->rm_origin == NULL) {
+ dsda->need_prep = B_TRUE;
+ return (EBUSY);
+ }
+
+ ndsda.ds = ds_prev;
+ ndsda.is_origin_rm = B_TRUE;
+ return (dsl_dataset_destroy_check(&ndsda, tag, tx));
+ }
+
+ /*
+ * If we're not going to remove the origin after all,
+ * undo the open context setup.
+ */
+ if (dsda->rm_origin != NULL) {
+ dsl_dataset_disown(dsda->rm_origin, tag);
+ dsda->rm_origin = NULL;
+ }
+
+ return (0);
+}
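+
[Editor's note: the need_prep handshake above is two-phase: the sync-context check refuses with EBUSY, and the caller is expected to own the origin in open context and retry with rm_origin filled in. A hypothetical sketch of the caller side — dp, ds, tag, defer and origin_obj are placeholders, and the B_TRUE inconsistentok flag is an assumption, not verbatim from this change:

	struct dsl_ds_destroyarg dsda = { 0 };

	dsda.ds = ds;
	dsda.defer = defer;
	error = dsl_sync_task_do(dp, dsl_dataset_destroy_check,
	    dsl_dataset_destroy_sync, &dsda, tag, 0);
	if (error == EBUSY && dsda.need_prep) {
		/* Own the clone's origin in open context, then retry. */
		error = dsl_dataset_own_obj(dp, origin_obj, B_TRUE, tag,
		    &dsda.rm_origin);
		if (error == 0)
			error = dsl_sync_task_do(dp,
			    dsl_dataset_destroy_check,
			    dsl_dataset_destroy_sync, &dsda, tag, 0);
	}
]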
+
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
/* we have an owner hold, so no one else can destroy us */
ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
- /* Can't delete a branch point. */
- if (ds->ds_phys->ds_num_children > 1)
- return (EEXIST);
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (dsda->defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ ASSERT(dsl_dataset_is_snapshot(ds));
+ return (0);
+ }
/*
* Can't delete a head dataset if there are snapshots of it.
@@ -1419,7 +1413,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
if (ds->ds_prev != NULL &&
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
+ return (EBUSY);
/*
* If we made changes this txg, traverse_dsl_dataset won't find
@@ -1428,6 +1422,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
return (EAGAIN);
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0 && !dsda->releasing)
+ return (EBUSY);
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Can't delete a branch point. However, if we're destroying
+ * a clone and removing its origin due to it having a user
+ * hold count of 0 and having been marked for deferred destroy,
+ * it's OK for the origin to have a single clone.
+ */
+ if (ds->ds_phys->ds_num_children >
+ (dsda->is_origin_rm ? 2 : 1)) {
+ mutex_exit(&ds->ds_lock);
+ return (EEXIST);
+ }
+ mutex_exit(&ds->ds_lock);
+ } else if (dsl_dir_is_clone(ds->ds_dir)) {
+ return (dsl_dataset_origin_check(dsda, arg2, tx));
+ }
+
/* XXX we should do some i/o error checking... */
return (0);
}
@@ -1500,24 +1519,132 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (ds->ds_dir->dd_phys->dd_clones == 0)
+ return;
+
+ for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+ dsl_dataset_remove_clones_key(clone, mintxg, tx);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+}
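+
[Editor's note: the zap_cursor walk used above (and several more times below) follows a fixed init/retrieve/advance/fini idiom; a minimal sketch over an arbitrary ZAP object, where mos and zapobj stand in for the caller's objset and object id:

	zap_cursor_t zc;
	zap_attribute_t za;

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* za.za_name is the entry's name; za.za_first_integer
		 * is its first integer value (here, a dataset obj id). */
	}
	zap_cursor_fini(&zc);	/* always fini the cursor when done */
]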
+
+struct process_old_arg {
+ dsl_dataset_t *ds;
+ dsl_dataset_t *ds_prev;
+ boolean_t after_branch_point;
+ zio_t *pio;
+ uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+ poa->ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY3U(zio_wait(poa.pio), ==, 0);
+ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
+ ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj);
+}
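+
[Editor's note: the deadlist swap relies on SWITCH64(), which exchanges two 64-bit values in place. Its local definition is deleted further down in this change, evidently in favor of a shared copy in a header:

	#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}
]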
+
void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
- zio_t *zio;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
int err;
int after_branch_point = FALSE;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
dsl_dataset_t *ds_prev = NULL;
+ boolean_t wont_destroy;
uint64_t obj;
- ASSERT(ds->ds_owner);
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ wont_destroy = (dsda->defer &&
+ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
+
+ ASSERT(ds->ds_owner || wont_destroy);
+ ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ if (wont_destroy) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ return;
+ }
+
/* signal any waiters that this dataset is going away */
mutex_enter(&ds->ds_lock);
ds->ds_owner = dsl_reaper;
@@ -1526,14 +1653,21 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
/* Remove our reservation */
if (ds->ds_reserved != 0) {
- uint64_t val = 0;
- dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+ dsl_prop_setarg_t psa;
+ uint64_t value = 0;
+
+ dsl_prop_setarg_init_uint64(&psa, "refreservation",
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ &value);
+ psa.psa_effective_value = 0; /* predict default value */
+
+ dsl_dataset_set_reservation_sync(ds, &psa, tx);
ASSERT3U(ds->ds_reserved, ==, 0);
}
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- dsl_pool_ds_destroyed(ds, tx);
+ dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
@@ -1562,26 +1696,36 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
/* This clone is toast. */
ASSERT(ds_prev->ds_phys->ds_num_children > 1);
ds_prev->ds_phys->ds_num_children--;
+
+ /*
+ * If the clone's origin has no other clones, no
+ * user holds, and has been marked for deferred
+ * deletion, then we should have done the necessary
+ * destroy setup for it.
+ */
+ if (ds_prev->ds_phys->ds_num_children == 1 &&
+ ds_prev->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds_prev)) {
+ ASSERT3P(dsda->rm_origin, !=, NULL);
+ } else {
+ ASSERT3P(dsda->rm_origin, ==, NULL);
+ }
} else if (!after_branch_point) {
ds_prev->ds_phys->ds_next_snap_obj =
ds->ds_phys->ds_next_snap_obj;
}
}
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- blkptr_t bp;
+ if (dsl_dataset_is_snapshot(ds)) {
dsl_dataset_t *ds_next;
- uint64_t itor = 0;
uint64_t old_unique;
- int64_t used = 0, compressed = 0, uncompressed = 0;
+ uint64_t used = 0, comp = 0, uncomp = 0;
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
- old_unique = dsl_dataset_unique(ds_next);
+ old_unique = ds_next->ds_phys->ds_unique_bytes;
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
ds_next->ds_phys->ds_prev_snap_obj =
@@ -1591,53 +1735,49 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
- /*
- * Transfer to our deadlist (which will become next's
- * new deadlist) any entries from next's current
- * deadlist which were born before prev, and free the
- * other entries.
- *
- * XXX we're doing this long task with the config lock held
- */
- while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
- if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
- &bp, tx));
- if (ds_prev && !after_branch_point &&
- bp.blk_birth >
- ds_prev->ds_phys->ds_prev_snap_txg) {
- ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- } else {
- used += bp_get_dasize(dp->dp_spa, &bp);
- compressed += BP_GET_PSIZE(&bp);
- uncompressed += BP_GET_UCSIZE(&bp);
- /* XXX check return value? */
- (void) dsl_free(zio, dp, tx->tx_txg,
- &bp, NULL, NULL, ARC_NOWAIT);
- }
- }
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds_prev->ds_phys->ds_prev_snap_txg,
+ ds->ds_phys->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ ds_prev->ds_phys->ds_unique_bytes += used;
+ }
- /* change snapused */
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
- -used, -compressed, -uncompressed, tx);
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_deadlist_obj, tx);
+ }
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
- /* free next's deadlist */
- bplist_close(&ds_next->ds_deadlist);
- bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+ /* Collapse range in clone heads */
+ dsl_dataset_remove_clones_key(ds,
+ ds->ds_phys->ds_creation_txg, tx);
- /* set next's deadlist to our deadlist */
- bplist_close(&ds->ds_deadlist);
- ds_next->ds_phys->ds_deadlist_obj =
- ds->ds_phys->ds_deadlist_obj;
- VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj));
- ds->ds_phys->ds_deadlist_obj = 0;
+ if (dsl_dataset_is_snapshot(ds_next)) {
+ dsl_dataset_t *ds_nextnext;
- if (ds_next->ds_phys->ds_next_snap_obj != 0) {
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
@@ -1646,25 +1786,27 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
- *
- * XXX we're doing this long task with the
- * config lock held
*/
- dsl_dataset_t *ds_after_next;
- uint64_t space;
-
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds_next->ds_phys->ds_next_snap_obj,
- FTAG, &ds_after_next));
-
- VERIFY(0 ==
- bplist_space_birthrange(&ds_after_next->ds_deadlist,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
ds->ds_phys->ds_prev_snap_txg,
- ds->ds_phys->ds_creation_txg, &space));
- ds_next->ds_phys->ds_unique_bytes += space;
-
- dsl_dataset_rele(ds_after_next, FTAG);
+ ds->ds_phys->ds_creation_txg,
+ &used, &comp, &uncomp);
+ ds_next->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &hds));
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ ds->ds_phys->ds_creation_txg, tx);
+ dsl_dataset_rele(hds, FTAG);
+
} else {
ASSERT3P(ds_next->ds_prev, ==, ds);
dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
@@ -1704,9 +1846,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
*/
struct killarg ka;
- ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj = 0;
/*
@@ -1717,17 +1858,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
* freed all the objects in open context.
*/
ka.ds = ds;
- ka.zio = zio;
ka.tx = tx;
err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
TRAVERSE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == 0);
+
+ if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(mos,
+ ds->ds_prev->ds_dir->dd_phys->dd_clones,
+ ds->ds_object, tx));
+ }
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = ds_prev = NULL;
+ }
}
- err = zio_wait(zio);
- ASSERT3U(err, ==, 0);
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
/* Erase the link in the dir */
@@ -1762,8 +1918,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_rele(ds_prev, FTAG);
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
if (ds->ds_phys->ds_next_clones_obj != 0) {
uint64_t count;
@@ -1774,10 +1930,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
}
if (ds->ds_phys->ds_props_obj != 0)
VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+ if (ds->ds_phys->ds_userrefs_obj != 0)
+ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
dsl_dir_close(ds->ds_dir, ds);
ds->ds_dir = NULL;
dsl_dataset_drain_refs(ds, tag);
VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+ if (dsda->rm_origin) {
+ /*
+ * Remove the origin of the clone we just destroyed.
+ */
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ ndsda.ds = dsda->rm_origin;
+ dsl_dataset_destroy_sync(&ndsda, tag, tx);
+ }
}
static int
@@ -1793,8 +1961,9 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
- asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
- if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
/*
@@ -1807,7 +1976,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
int
dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1848,7 +2016,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *snapname = arg2;
@@ -1919,25 +2087,31 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
- int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
- add, 0, 0, tx);
+ delta, 0, 0, tx);
}
- bplist_close(&ds->ds_deadlist);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
+ ds->ds_dir->dd_myname, snapname, dsobj,
+ ds->ds_phys->ds_prev_snap_txg);
+ ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
+ UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, tx);
+
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
ds->ds_phys->ds_prev_snap_obj = dsobj;
ds->ds_phys->ds_prev_snap_txg = crtxg;
ds->ds_phys->ds_unique_bytes = 0;
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
- dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &dsobj, tx);
ASSERT(err == 0);
@@ -1947,9 +2121,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == dsl_dataset_get_ref(dp,
ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
- dsl_pool_ds_snapshotted(ds, tx);
+ dsl_scan_ds_snapshotted(ds, tx);
- spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
"dataset = %llu", dsobj);
}
@@ -1957,7 +2133,7 @@ void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_objset != NULL);
ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
/*
@@ -1968,7 +2144,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dsl_dir_dirty(ds->ds_dir, tx);
- dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+ dmu_objset_sync(ds->ds_objset, zio, tx);
}
void
@@ -1992,6 +2168,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
ds->ds_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
ds->ds_phys->ds_guid);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+ ds->ds_phys->ds_unique_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+ ds->ds_object);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+ ds->ds_userrefs);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
if (ds->ds_phys->ds_next_snap_obj) {
/*
@@ -2075,8 +2259,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
if (ds->ds_prev == NULL)
return (B_FALSE);
if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (B_TRUE);
+ ds->ds_prev->ds_phys->ds_creation_txg) {
+ objset_t *os, *os_prev;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_prev->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
return (B_FALSE);
}
@@ -2113,8 +2310,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *newsnapname = arg2;
@@ -2138,8 +2334,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
ds->ds_snapname, 8, 1, &ds->ds_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
dsl_dataset_rele(hds, FTAG);
}
@@ -2151,43 +2347,36 @@ struct renamesnaparg {
};
static int
-dsl_snapshot_rename_one(char *name, void *arg)
+dsl_snapshot_rename_one(const char *name, void *arg)
{
struct renamesnaparg *ra = arg;
dsl_dataset_t *ds = NULL;
- char *cp;
+ char *snapname;
int err;
- cp = name + strlen(name);
- *cp = '@';
- (void) strcpy(cp + 1, ra->oldsnap);
+ snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
+ (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
/*
* For recursive snapshot renames the parent won't be changing
* so we just pass name for both the to/from argument.
*/
- err = zfs_secpolicy_rename_perms(name, name, CRED());
- if (err == ENOENT) {
- return (0);
- } else if (err) {
- (void) strcpy(ra->failed, name);
- return (err);
+ err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
+ if (err != 0) {
+ strfree(snapname);
+ return (err == ENOENT ? 0 : err);
}
#ifdef _KERNEL
/*
* For all filesystems undergoing rename, we'll need to unmount it.
*/
- (void) zfs_unmount_snap(name, NULL);
+ (void) zfs_unmount_snap(snapname, NULL);
#endif
- err = dsl_dataset_hold(name, ra->dstg, &ds);
- *cp = '\0';
- if (err == ENOENT) {
- return (0);
- } else if (err) {
- (void) strcpy(ra->failed, name);
- return (err);
- }
+ err = dsl_dataset_hold(snapname, ra->dstg, &ds);
+ strfree(snapname);
+ if (err != 0)
+ return (err == ENOENT ? 0 : err);
dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
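+
[Editor's note: the rewrite above drops the in-place '@' splicing in favor of the kmem_asprintf()/strfree() pair; the idiom, with fsname and oldsnap standing in for the caller's strings, is simply:

	char *snapname = kmem_asprintf("%s@%s", fsname, oldsnap);
	/* ... use snapname ... */
	strfree(snapname);	/* strfree() pairs with kmem_asprintf() */
]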
@@ -2203,7 +2392,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
dsl_sync_task_t *dst;
spa_t *spa;
char *cp, *fsname = spa_strdup(oldname);
- int len = strlen(oldname);
+ int len = strlen(oldname) + 1;
/* truncate the snapshot name to get the fsname */
cp = strchr(fsname, '@');
@@ -2211,7 +2400,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
err = spa_open(fsname, &spa, FTAG);
if (err) {
- kmem_free(fsname, len + 1);
+ kmem_free(fsname, len);
return (err);
}
ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
@@ -2223,7 +2412,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
DS_FIND_CHILDREN);
- kmem_free(fsname, len + 1);
+ kmem_free(fsname, len);
if (err == 0) {
err = dsl_sync_task_group_wait(ra->dstg);
@@ -2234,14 +2423,15 @@ dsl_recursive_rename(char *oldname, const char *newname)
dsl_dataset_t *ds = dst->dst_arg1;
if (dst->dst_err) {
dsl_dir_name(ds->ds_dir, ra->failed);
- (void) strcat(ra->failed, "@");
- (void) strcat(ra->failed, ra->newsnap);
+ (void) strlcat(ra->failed, "@", sizeof (ra->failed));
+ (void) strlcat(ra->failed, ra->newsnap,
+ sizeof (ra->failed));
}
dsl_dataset_rele(ds, ra->dstg);
}
if (err)
- (void) strcpy(oldname, ra->failed);
+ (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
dsl_sync_task_group_destroy(ra->dstg);
kmem_free(ra, sizeof (struct renamesnaparg));
@@ -2250,7 +2440,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
}
static int
-dsl_valid_rename(char *oldname, void *arg)
+dsl_valid_rename(const char *oldname, void *arg)
{
int delta = *(int *)arg;
@@ -2272,12 +2462,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
err = dsl_dir_open(oldname, FTAG, &dd, &tail);
if (err)
return (err);
- /*
- * If there are more than 2 references there may be holds
- * hanging around that haven't been cleared out yet.
- */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- txg_wait_synced(dd->dd_pool, 0);
+
if (tail == NULL) {
int delta = strlen(newname) - strlen(oldname);
@@ -2286,13 +2471,14 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
err = dmu_objset_find(oldname, dsl_valid_rename,
&delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
- if (!err)
+ if (err == 0)
err = dsl_dir_rename(dd, newname);
dsl_dir_close(dd, FTAG);
return (err);
}
+
if (tail[0] != '@') {
- /* the name ended in a nonexistant component */
+ /* the name ended in a nonexistent component */
dsl_dir_close(dd, FTAG);
return (ENOENT);
}
@@ -2331,13 +2517,14 @@ struct promotenode {
struct promotearg {
list_t shared_snaps, origin_snaps, clone_snaps;
- dsl_dataset_t *origin_origin, *origin_head;
+ dsl_dataset_t *origin_origin;
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+ char *err_ds;
};
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static boolean_t snaplist_unstable(list_t *l);
-/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -2346,6 +2533,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
struct promotenode *snap = list_head(&pa->shared_snaps);
dsl_dataset_t *origin_ds = snap->ds;
int err;
+ uint64_t unused;
/* Check that it is a real clone */
if (!dsl_dir_is_clone(hds->ds_dir))
@@ -2361,10 +2549,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* compute origin's new unique space */
snap = list_tail(&pa->clone_snaps);
ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
- err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
- if (err)
- return (err);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &pa->unique, &unused, &unused);
/*
* Walk the snapshots that we are moving
@@ -2392,18 +2579,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* Check that the snapshot name does not conflict */
VERIFY(0 == dsl_dataset_get_snapname(ds));
err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
- if (err == 0)
- return (EEXIST);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
if (err != ENOENT)
- return (err);
+ goto out;
/* The very first snapshot does not have a deadlist */
if (ds->ds_phys->ds_prev_snap_obj == 0)
continue;
- if (err = bplist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp))
- return (err);
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
pa->used += dlused;
pa->comp += dlcomp;
pa->uncomp += dluncomp;
@@ -2436,19 +2624,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* Note, typically this will not be a clone of a clone,
- * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
- * these snaplist_space() -> bplist_space_birthrange()
+ * so dd_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> dsl_deadlist_space_range()
* calls will be fast because they do not have to
* iterate over all bps.
*/
snap = list_head(&pa->origin_snaps);
err = snaplist_space(&pa->shared_snaps,
- snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+ snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
if (err)
return (err);
err = snaplist_space(&pa->clone_snaps,
- snap->ds->ds_origin_txg, &space);
+ snap->ds->ds_dir->dd_origin_txg, &space);
if (err)
return (err);
pa->cloneusedsnap += space;
@@ -2461,10 +2649,13 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
return (0);
+out:
+ pa->err_ds = snap->ds->ds_snapname;
+ return (err);
}
static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *hds = arg1;
struct promotearg *pa = arg2;
@@ -2508,10 +2699,31 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(dd->dd_dbuf, tx);
ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
- hds->ds_origin_txg = origin_head->ds_origin_txg;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
dmu_buf_will_dirty(odd->dd_dbuf, tx);
odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
- origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+ origin_head->ds_dir->dd_origin_txg =
+ origin_ds->ds_phys->ds_creation_txg;
+
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, hds->ds_object, tx));
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ origin_head->ds_object, tx));
+ if (dd->dd_phys->dd_clones == 0) {
+ dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+
+ }
/* move snapshots to this dir */
for (snap = list_head(&pa->shared_snaps); snap;
@@ -2519,9 +2731,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_t *ds = snap->ds;
/* unregister props as dsl_dir is changing */
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
}
/* move snap name entry */
VERIFY(0 == dsl_dataset_get_snapname(ds));
@@ -2530,6 +2742,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
@@ -2539,6 +2752,40 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
+ /* move any clone references */
+ if (ds->ds_phys->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+
+ VERIFY3U(zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, o, tx), ==, 0);
+ VERIFY3U(zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, o, tx), ==, 0);
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
ASSERT3U(dsl_prop_numcb(ds), ==, 0);
}
@@ -2568,8 +2815,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
origin_ds->ds_phys->ds_unique_bytes = pa->unique;
/* log history record */
- spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", hds->ds_object);
+ spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", hds->ds_object);
dsl_dir_close(odd, FTAG);
}
@@ -2634,11 +2881,9 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
*spacep = 0;
for (snap = list_head(l); snap; snap = list_next(l, snap)) {
- uint64_t used;
- int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- mintxg, UINT64_MAX, &used);
- if (err)
- return (err);
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
*spacep += used;
}
return (0);
@@ -2673,7 +2918,7 @@ snaplist_destroy(list_t *l, boolean_t own)
* NULL, indicating that the clone is not a clone of a clone).
*/
int
-dsl_dataset_promote(const char *name)
+dsl_dataset_promote(const char *name, char *conflsnap)
{
dsl_dataset_t *ds;
dsl_dir_t *dd;
@@ -2725,10 +2970,10 @@ dsl_dataset_promote(const char *name)
if (err != 0)
goto out;
- if (dsl_dir_is_clone(snap->ds->ds_dir)) {
- err = dsl_dataset_own_obj(dp,
+ if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
snap->ds->ds_dir->dd_phys->dd_origin_obj,
- 0, FTAG, &pa.origin_origin);
+ FTAG, &pa.origin_origin);
if (err != 0)
goto out;
}
@@ -2744,14 +2989,16 @@ out:
if (err == 0) {
err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
dsl_dataset_promote_sync, ds, &pa,
- 2 + 2 * doi.doi_physical_blks);
+ 2 + 2 * doi.doi_physical_blocks_512);
+ if (err && pa.err_ds && conflsnap)
+ (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
}
snaplist_destroy(&pa.shared_snaps, B_TRUE);
snaplist_destroy(&pa.clone_snaps, B_FALSE);
snaplist_destroy(&pa.origin_snaps, B_FALSE);
if (pa.origin_origin)
- dsl_dataset_disown(pa.origin_origin, FTAG);
+ dsl_dataset_rele(pa.origin_origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (err);
}
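+
[Editor's note: with the new conflsnap out-parameter, a caller can report exactly which snapshot name blocked the promotion; a hypothetical use (the dataset name is illustrative):

	char conflsnap[MAXNAMELEN];
	int error = dsl_dataset_promote("pool/clone", conflsnap);

	if (error == EEXIST) {
		/* conflsnap now holds the conflicting snapshot name */
	}
]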
@@ -2778,9 +3025,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (csa->cds->ds_prev != csa->ohds->ds_prev)
return (EINVAL);
- /* cds should be the clone */
- if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
- csa->ohds->ds_object)
+ /* cds should be the clone (unless they are unrelated) */
+ if (csa->cds->ds_prev != NULL &&
+ csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
+ csa->ohds->ds_object !=
+ csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
return (EINVAL);
/* the clone should be a child of the origin */
@@ -2803,38 +3052,49 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
+ if (csa->ohds->ds_quota != 0 &&
+ csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
+ return (EDQUOT);
+
return (0);
}
/* ARGSUSED */
static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
struct cloneswaparg *csa = arg1;
dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
ASSERT(csa->cds->ds_reserved == 0);
- ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);
+ ASSERT(csa->ohds->ds_quota == 0 ||
+ csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
- dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
- if (csa->cds->ds_user_ptr != NULL) {
- csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
- csa->cds->ds_user_ptr = NULL;
+ if (csa->cds->ds_objset != NULL) {
+ dmu_objset_evict(csa->cds->ds_objset);
+ csa->cds->ds_objset = NULL;
}
- if (csa->ohds->ds_user_ptr != NULL) {
- csa->ohds->ds_user_evict_func(csa->ohds,
- csa->ohds->ds_user_ptr);
- csa->ohds->ds_user_ptr = NULL;
+ if (csa->ohds->ds_objset != NULL) {
+ dmu_objset_evict(csa->ohds->ds_objset);
+ csa->ohds->ds_objset = NULL;
}
- /* reset origin's unique bytes */
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
- &csa->cds->ds_prev->ds_phys->ds_unique_bytes));
+ /*
+ * Reset origin's unique bytes, if it exists.
+ */
+ if (csa->cds->ds_prev) {
+ dsl_dataset_t *origin = csa->cds->ds_prev;
+ uint64_t comp, uncomp;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+ origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+ }
/* swap blkptrs */
{
@@ -2853,10 +3113,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT3U(csa->cds->ds_dir->dd_phys->
dd_used_breakdown[DD_USED_SNAP], ==, 0);
- VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
- &cdl_comp, &cdl_uncomp));
- VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
- &odl_comp, &odl_uncomp));
+ dsl_deadlist_space(&csa->cds->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&csa->ohds->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
(csa->ohds->ds_phys->ds_used_bytes + odl_used);
@@ -2877,21 +3137,16 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
- VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
-#define SWITCH64(x, y) \
- { \
- uint64_t __tmp = (x); \
- (x) = (y); \
- (y) = __tmp; \
- }
-
/* swap ds_*_bytes */
SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
csa->cds->ds_phys->ds_used_bytes);
@@ -2906,22 +3161,26 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
csa->unused_refres_delta, 0, 0, tx);
- /* swap deadlists */
- bplist_close(&csa->cds->ds_deadlist);
- bplist_close(&csa->ohds->ds_deadlist);
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&csa->cds->ds_deadlist);
+ dsl_deadlist_close(&csa->ohds->ds_deadlist);
SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
csa->cds->ds_phys->ds_deadlist_obj);
- VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
- csa->cds->ds_phys->ds_deadlist_obj));
- VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
- csa->ohds->ds_phys->ds_deadlist_obj));
+ dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+ csa->cds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+ csa->ohds->ds_phys->ds_deadlist_obj);
- dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+ dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
/*
- * Swap 'clone' with its origin head file system. Used at the end
- * of "online recv" to swizzle the file system to the new version.
+ * Swap 'clone' with its origin head dataset.  Used at the end of "zfs
+ * recv" into an existing fs to swizzle the file system to the new
+ * version, and by "zfs rollback". Can also be used to swap two
+ * independent head datasets if neither has any snapshots.
*/
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
@@ -2933,9 +3192,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
ASSERT(clone->ds_owner);
ASSERT(origin_head->ds_owner);
retry:
- /* Need exclusive access for the swap */
- rw_enter(&clone->ds_rwlock, RW_WRITER);
- if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
+ /*
+ * Need exclusive access for the swap. If we're swapping these
+ * datasets back after an error, we already hold the locks.
+ */
+ if (!RW_WRITE_HELD(&clone->ds_rwlock))
+ rw_enter(&clone->ds_rwlock, RW_WRITER);
+ if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
+ !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
rw_exit(&clone->ds_rwlock);
rw_enter(&origin_head->ds_rwlock, RW_WRITER);
if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
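+
[Editor's note: the retry dance above avoids an AB/BA deadlock between two concurrent swaps: take the first lock, try the second, and if that fails drop everything, block on the second, and re-try the first. In generic form, with illustrative names:

	retry:
		rw_enter(&a->lock, RW_WRITER);
		if (!rw_tryenter(&b->lock, RW_WRITER)) {
			rw_exit(&a->lock);	/* drop to avoid deadlock */
			rw_enter(&b->lock, RW_WRITER);	/* block on it */
			if (!rw_tryenter(&a->lock, RW_WRITER)) {
				rw_exit(&b->lock);
				goto retry;	/* lost the race; restart */
			}
		}
]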
@@ -3030,62 +3294,70 @@ static int
dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
+ dsl_prop_setarg_t *psa = arg2;
+ int err;
if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
return (ENOTSUP);
- if (new_quota == 0)
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ if (psa->psa_effective_value == 0)
return (0);
- if (new_quota < ds->ds_phys->ds_used_bytes ||
- new_quota < ds->ds_reserved)
+ if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+ psa->psa_effective_value < ds->ds_reserved)
return (ENOSPC);
return (0);
}
-/* ARGSUSED */
+extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
+
void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
- ds->ds_quota = new_quota;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
- dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+ if (ds->ds_quota != effective_value) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_quota = effective_value;
- spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu ",
- (longlong_t)new_quota, ds->ds_object);
+ spa_history_log_internal(LOG_DS_REFQUOTA,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
+ (longlong_t)ds->ds_quota, ds->ds_object);
+ }
}
int
-dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
{
dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
+ dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
+
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
- if (quota != ds->ds_quota) {
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(ds->ds_dir->dd_pool, 0);
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(ds->ds_dir->dd_pool, 0);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
+ ds, &psa, 0);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
- ds, &quota, 0);
- }
dsl_dataset_rele(ds, FTAG);
return (err);
}
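+
[Editor's note: the property-setting flow introduced here is three-step: initialize a setarg in open context, predict the effective value during the sync-task check, then apply and verify in the sync function. Schematically, condensed from the quota code above (not a complete function; source and quota are the caller's values):

	dsl_prop_setarg_t psa;
	uint64_t value = quota;

	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &value);

	/* in the check function: */
	err = dsl_prop_predict_sync(ds->ds_dir, &psa);
	/* ... validate psa.psa_effective_value ... */

	/* in the sync function: */
	dsl_prop_set_sync(ds, &psa, tx);
	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, &psa);
]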
@@ -3094,9 +3366,10 @@ static int
dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value;
uint64_t unique;
+ int err;
if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
SPA_VERSION_REFRESERVATION)
@@ -3105,6 +3378,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (dsl_dataset_is_snapshot(ds))
return (EINVAL);
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ effective_value = psa->psa_effective_value;
+
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
@@ -3113,67 +3391,645 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = ds->ds_phys->ds_unique_bytes;
mutex_exit(&ds->ds_lock);
- if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
- uint64_t delta = MAX(unique, new_reservation) -
+ if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
+ uint64_t delta = MAX(unique, effective_value) -
MAX(unique, ds->ds_reserved);
if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
if (ds->ds_quota > 0 &&
- new_reservation > ds->ds_quota)
+ effective_value > ds->ds_quota)
return (ENOSPC);
}
return (0);
}
-/* ARGSUSED */
static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx)
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
uint64_t unique;
int64_t delta;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
+
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
- delta = MAX(0, (int64_t)(new_reservation - unique)) -
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = ds->ds_phys->ds_unique_bytes;
+ delta = MAX(0, (int64_t)(effective_value - unique)) -
MAX(0, (int64_t)(ds->ds_reserved - unique));
- ds->ds_reserved = new_reservation;
+ ds->ds_reserved = effective_value;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
mutex_exit(&ds->ds_dir->dd_lock);
- dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
- new_reservation, cr, tx);
- spa_history_internal_log(LOG_DS_REFRESERV,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
- (longlong_t)new_reservation, ds->ds_object);
+ spa_history_log_internal(LOG_DS_REFRESERV,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
+ (longlong_t)effective_value, ds->ds_object);
}
int
-dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation)
{
dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
+ dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
+ &reservation);
+
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_set_reservation_check,
- dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+ dsl_dataset_set_reservation_sync, ds, &psa, 0);
+
dsl_dataset_rele(ds, FTAG);
return (err);
}
+
+typedef struct zfs_hold_cleanup_arg {
+ dsl_pool_t *dp;
+ uint64_t dsobj;
+ char htag[MAXNAMELEN];
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *ca = arg;
+
+ (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
+ B_TRUE);
+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+}
+
+void
+dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *ca;
+
+ ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
+ ca->dp = ds->ds_dir->dd_pool;
+ ca->dsobj = ds->ds_object;
+ (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
+ VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, ca, NULL));
+}
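+
[Editor's note: a caller arranging cleanup-on-exit first converts its cleanup file descriptor into a minor number, then registers the callback; a sketch using the calls that appear later in this change:

	minor_t minor;

	if (zfs_onexit_fd_hold(cleanup_fd, &minor) == 0) {
		/* ... take the temporary hold on ds ... */
		dsl_register_onexit_hold_cleanup(ds, htag, minor);
		zfs_onexit_fd_rele(cleanup_fd);
	}
]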
+
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
+static int
+dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct dsl_ds_holdarg *ha = arg2;
+ char *htag = ha->htag;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int error = 0;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ if (!dsl_dataset_is_snapshot(ds))
+ return (EINVAL);
+
+ /* tags must be unique */
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj) {
+ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
+ 8, 1, tx);
+ if (error == 0)
+ error = EEXIST;
+ else if (error == ENOENT)
+ error = 0;
+ }
+ mutex_exit(&ds->ds_lock);
+
+ if (error == 0 && ha->temphold &&
+ strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ error = E2BIG;
+
+ return (error);
+}
+
+void
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct dsl_ds_holdarg *ha = arg2;
+ char *htag = ha->htag;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t now = gethrestime_sec();
+ uint64_t zapobj;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = ds->ds_phys->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+ mutex_exit(&ds->ds_lock);
+
+ VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ if (ha->temphold) {
+ VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
+ htag, &now, tx));
+ }
+
+ spa_history_log_internal(LOG_DS_USER_HOLD,
+ dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
+ (int)ha->temphold, ds->ds_object);
+}
+
+static int
+dsl_dataset_user_hold_one(const char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ dsl_dataset_t *ds;
+ int error;
+ char *name;
+
+ /* alloc a buffer to hold dsname@snapname plus terminating NULL */
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, ha->dstg, &ds);
+ strfree(name);
+ if (error == 0) {
+ ha->gotone = B_TRUE;
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, ds, ha, 0);
+ } else if (error == ENOENT && ha->recursive) {
+ error = 0;
+ } else {
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+ }
+ return (error);
+}
+
+int
+dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold)
+{
+ struct dsl_ds_holdarg *ha;
+ int error;
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+ ha->htag = htag;
+ ha->temphold = temphold;
+ error = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
+ ds, ha, 0);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+
+ return (error);
+}
+
+int
+dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive, boolean_t temphold, int cleanup_fd)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+ minor_t minor = 0;
+
+ if (cleanup_fd != -1) {
+ /* Currently we only support cleanup-on-exit of tempholds. */
+ if (!temphold)
+ return (EINVAL);
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error)
+ return (error);
+ }
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ ha->temphold = temphold;
+
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_hold_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+
+ if (dst->dst_err) {
+ dsl_dataset_name(ds, ha->failed);
+ *strchr(ha->failed, '@') = '\0';
+ } else if (error == 0 && minor != 0 && temphold) {
+ /*
+ * If this hold is to be released upon process exit,
+ * register that action now.
+ */
+ dsl_register_onexit_hold_cleanup(ds, htag, minor);
+ }
+ dsl_dataset_rele(ds, ha->dstg);
+ }
+
+ if (error == 0 && recursive && !ha->gotone)
+ error = ENOENT;
+
+ if (error)
+ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
+
+ dsl_sync_task_group_destroy(ha->dstg);
+
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+}
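+
[Editor's note: dsname must be writable — on failure the name of the offending dataset is copied back into it (see the strlcpy of ha->failed above). A hypothetical call taking a recursive, persistent hold; the names are illustrative:

	char dsname[MAXNAMELEN] = "pool/fs";
	char snapname[MAXNAMELEN] = "snap1";
	char htag[MAXNAMELEN] = "backup";

	error = dsl_dataset_user_hold(dsname, snapname, htag,
	    B_TRUE /* recursive */, B_FALSE /* temphold */,
	    -1 /* no cleanup fd */);
]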
+
+struct dsl_ds_releasearg {
+ dsl_dataset_t *ds;
+ const char *htag;
+ boolean_t own; /* do we own or just hold ds? */
+};
+
+static int
+dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
+ boolean_t *might_destroy)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t tmp;
+ int error;
+
+ *might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ if (zapobj == 0) {
+ /* The tag can't possibly exist */
+ mutex_exit(&ds->ds_lock);
+ return (ESRCH);
+ }
+
+ /* Make sure the tag exists */
+ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
+ if (error) {
+ mutex_exit(&ds->ds_lock);
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds))
+ *might_destroy = B_TRUE;
+
+ mutex_exit(&ds->ds_lock);
+ return (0);
+}
+
+static int
+dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ boolean_t might_destroy;
+ int error;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
+ if (error)
+ return (error);
+
+ if (might_destroy) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ if (dmu_tx_is_syncing(tx)) {
+ /*
+ * If we're not prepared to remove the snapshot,
+ * we can't allow the release to happen right now.
+ */
+ if (!ra->own)
+ return (EBUSY);
+ }
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ return (dsl_dataset_destroy_check(&dsda, tag, tx));
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t refs;
+ int error;
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_userrefs--;
+ refs = ds->ds_userrefs;
+ mutex_exit(&ds->ds_lock);
+ error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
+ VERIFY(error == 0 || error == ENOENT);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
+ if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds)) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ ASSERT(ra->own);
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ /* We already did the destroy_check */
+ dsl_dataset_destroy_sync(&dsda, tag, tx);
+ }
+
+ spa_history_log_internal(LOG_DS_USER_RELEASE,
+ dp->dp_spa, tx, "<%s> %lld dataset = %llu",
+ ra->htag, (longlong_t)refs, dsobj);
+}
+
+static int
+dsl_dataset_user_release_one(const char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ struct dsl_ds_releasearg *ra;
+ dsl_dataset_t *ds;
+ int error;
+ void *dtag = ha->dstg;
+ char *name;
+ boolean_t own = B_FALSE;
+ boolean_t might_destroy;
+
+ /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, dtag, &ds);
+ strfree(name);
+ if (error == ENOENT && ha->recursive)
+ return (0);
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+ if (error)
+ return (error);
+
+ ha->gotone = B_TRUE;
+
+ ASSERT(dsl_dataset_is_snapshot(ds));
+
+ error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+
+ if (might_destroy) {
+#ifdef _KERNEL
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = zfs_unmount_snap(name, NULL);
+ strfree(name);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+#endif
+ if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
+ dsl_dataset_rele(ds, dtag);
+ return (EBUSY);
+ } else {
+ own = B_TRUE;
+ dsl_dataset_make_exclusive(ds, dtag);
+ }
+ }
+
+ ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
+ ra->ds = ds;
+ ra->htag = ha->htag;
+ ra->own = own;
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, ra, dtag, 0);
+
+ return (0);
+}
+
+int
+dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+
+top:
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_release_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ struct dsl_ds_releasearg *ra = dst->dst_arg1;
+ dsl_dataset_t *ds = ra->ds;
+
+ if (dst->dst_err)
+ dsl_dataset_name(ds, ha->failed);
+
+ if (ra->own)
+ dsl_dataset_disown(ds, ha->dstg);
+ else
+ dsl_dataset_rele(ds, ha->dstg);
+
+ kmem_free(ra, sizeof (struct dsl_ds_releasearg));
+ }
+
+ if (error == 0 && recursive && !ha->gotone)
+ error = ENOENT;
+
+ if (error && error != EBUSY)
+ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
+
+ dsl_sync_task_group_destroy(ha->dstg);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+
+ /*
+ * We can get EBUSY if we were racing with deferred destroy and
+ * dsl_dataset_user_release_check() hadn't done the necessary
+ * open context setup. We can also get EBUSY if we're racing
+ * with destroy and that thread is the ds_owner. Either way
+ * the busy condition should be transient, and we should retry
+ * the release operation.
+ */
+ if (error == EBUSY)
+ goto top;
+
+ return (error);
+}
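A caller sketch for this entry point (hypothetical names; a minimal sketch, not taken from the patch). Note that dsname must be a writable buffer rather than a string literal, because on failure the name of the offending dataset is copied back into it:

    char ds[MAXPATHLEN];
    int error;

    (void) strlcpy(ds, "pool/fs", sizeof (ds));
    error = dsl_dataset_user_release(ds, "snap", "backup", B_FALSE);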
+
+/*
+ * Called at spa_load time (with retry == B_FALSE) to release a stale
+ * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
+ */
+int
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
+ boolean_t retry)
+{
+ dsl_dataset_t *ds;
+ char *snap;
+ char *name;
+ int namelen;
+ int error;
+
+ do {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error)
+ return (error);
+ namelen = dsl_dataset_namelen(ds)+1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(ds, name);
+ dsl_dataset_rele(ds, FTAG);
+
+ snap = strchr(name, '@');
+ *snap = '\0';
+ ++snap;
+ error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
+ kmem_free(name, namelen);
+
+ /*
+ * The object can't have been destroyed because we have a hold,
+ * but it might have been renamed, resulting in ENOENT. Retry
+ * if we've been requested to do so.
+ *
+ * It would be nice if we could use the dsobj all the way
+ * through and avoid ENOENT entirely. But we might need to
+ * unmount the snapshot, and there's currently no way to lookup
+ * a vfsp using a ZFS object id.
+ */
+ } while ((error == ENOENT) && retry);
+
+ return (error);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (err)
+ return (err);
+
+ VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
+ if (ds->ds_phys->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
+ za->za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
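The nvlist returned above maps each hold tag to the uint64 stored in the ZAP (the value recorded when the hold was taken). A minimal consumer sketch, assuming a hypothetical snapshot name and using the standard nvpair iteration idiom:

    nvlist_t *holds;
    nvpair_t *pair = NULL;

    if (dsl_dataset_get_holds("pool/fs@snap", &holds) == 0) {
            while ((pair = nvlist_next_nvpair(holds, pair)) != NULL) {
                    uint64_t val;

                    /* nvpair_name(pair) is the hold tag */
                    VERIFY(0 == nvpair_value_uint64(pair, &val));
            }
            nvlist_free(holds);
    }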
+
+/*
+ * Note: this function is used as the callback for dmu_objset_find(). We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+ dsl_dataset_t *ds;
+
+ if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
+ if (DS_IS_INCONSISTENT(ds))
+ (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
+ else
+ dsl_dataset_disown(ds, FTAG);
+ }
+ return (0);
+}
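A representative invocation (sketched here; the actual call site lives outside this hunk) sweeps an entire pool for half-destroyed datasets, e.g. left behind by an interrupted receive, at import time:

    /* illustrative; per-dataset errors are deliberately swallowed */
    (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent,
        NULL, DS_FIND_CHILDREN);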
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
new file mode 100644
index 000000000000..064f8aceb8ee
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_entry_t *dle1 = arg1;
+ const dsl_deadlist_entry_t *dle2 = arg2;
+
+ if (dle1->dle_mintxg < dle2->dle_mintxg)
+ return (-1);
+ else if (dle1->dle_mintxg > dle2->dle_mintxg)
+ return (+1);
+ else
+ return (0);
+}
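This comparator honors the AVL contract, which requires a return value of exactly -1, 0, or +1 (avl_find() asserts as much). That is also why it uses explicit comparisons: returning a raw difference would wrap for uint64_t keys and could truncate when narrowed to int.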
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havetree)
+ return;
+
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ za.za_first_integer));
+ avl_add(&dl->dl_tree, dle);
+ }
+ zap_cursor_fini(&zc);
+ dl->dl_havetree = B_TRUE;
+}
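The on-disk shape this mirrors, with made-up contents:

    /*
     * Deadlist ZAP: keys are stringified mintxg values, values are
     * bpobj object numbers (all numbers hypothetical):
     *
     *      "0"   -> 153    blocks born in (0, 100]
     *      "100" -> 467    blocks born in (100, 200]
     *      "200" -> 513    blocks born after txg 200
     *
     * dsl_deadlist_load_tree() replays this mapping into the in-core
     * AVL tree exactly once per dsl_deadlist_t.
     */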
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+ void *cookie = NULL;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt) {
+ dl->dl_oldfmt = B_FALSE;
+ bpobj_close(&dl->dl_bpobj);
+ return;
+ }
+
+ if (dl->dl_havetree) {
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+ != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+ }
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ mutex_destroy(&dl->dl_lock);
+ dl->dl_dbuf = NULL;
+ dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+ sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, os, dlobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc))
+ bpobj_free(os, za.za_first_integer, tx);
+ zap_cursor_fini(&zc);
+ VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used +=
+ bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+ mutex_exit(&dl->dl_lock);
+
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+ bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
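The bucket selection above deserves a worked example (same hypothetical keys as before):

    /*
     * Tree keys: { 0, 100, 200 }
     *
     *      blk_birth = 150 -> avl_find() misses, AVL_BEFORE -> bucket 100
     *      blk_birth = 200 -> avl_find() hits 200, AVL_PREV -> bucket 100
     *      blk_birth = 250 -> avl_find() misses, AVL_BEFORE -> bucket 200
     *
     * A block born exactly at a key txg lands in the previous bucket,
     * matching the "mintxg is not inclusive" rule noted just below.
     */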
+
+/*
+ * Insert a new key into the deadlist; it must be greater than all
+ * current entries. mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+ bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+ dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
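Continuing the hypothetical keys { 0, 100, 200 }: removing key 200 (for instance, when the snapshot that introduced it is destroyed) enqueues bucket 200's bpobj as a subobj of bucket 100, which afterwards covers every block born after txg 100. No block pointers are copied; only the bpobj object number changes hands.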
+
+/*
+ * Walk ds's snapshots to regenerate the deadlist ZAP and AVL tree.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+ mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+ usedp, compp, uncompp));
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ *usedp = dl->dl_phys->dl_used;
+ *compp = dl->dl_phys->dl_comp;
+ *uncompp = dl->dl_phys->dl_uncomp;
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+ *usedp = *compp = *uncompp = 0;
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ /*
+ * If we don't find this mintxg, there shouldn't be anything
+ * after it either.
+ */
+ ASSERT(dle != NULL ||
+ avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+ for (; dle && dle->dle_mintxg < maxtxg;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t used, comp, uncomp;
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+}
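A quick check of the interval arithmetic against the hypothetical keys { 0, 100, 200 }: space_range(mintxg = 100, maxtxg = 200) starts at the entry keyed 100, and the loop stops before key 200, so it sums exactly the one bucket holding blocks born in (100, 200] -- consistent with the half-open interval documented above.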
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+ mutex_exit(&dl->dl_lock);
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_iterate(&bpo,
+ dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ zap_cursor_fini(&zc);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries from dl that are >= mintxg and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ mutex_enter(&dl->dl_lock);
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ mutex_exit(&dl->dl_lock);
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
index 7ff843044885..b85c373e3d03 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -75,8 +74,6 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/cred.h>
@@ -150,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
}
static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -185,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(zap_update(mos, jumpobj,
perm, 8, 1, &n, tx) == 0);
- spa_history_internal_log(LOG_DS_PERM_UPDATE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_UPDATE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
@@ -194,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -217,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
(void) zap_remove(mos, zapobj, whokey, tx);
VERIFY(0 == zap_destroy(mos, jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s dataset = %llu", whokey,
dd->dd_phys->dd_head_dataset_obj);
continue;
@@ -238,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_destroy(mos,
jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
@@ -531,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
* Check if user has requested permission.
*/
int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
{
- dsl_dataset_t *ds;
dsl_dir_t *dd;
dsl_pool_t *dp;
void *cookie;
@@ -543,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
avl_tree_t permsets;
perm_set_t *setnode;
- error = dsl_dataset_hold(dsname, FTAG, &ds);
- if (error)
- return (error);
-
dp = ds->ds_dir->dd_pool;
mos = dp->dp_meta_objset;
- if (dsl_delegation_on(mos) == B_FALSE) {
- dsl_dataset_rele(ds, FTAG);
+ if (dsl_delegation_on(mos) == B_FALSE)
return (ECANCELED);
- }
if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
- SPA_VERSION_DELEGATED_PERMS) {
- dsl_dataset_rele(ds, FTAG);
+ SPA_VERSION_DELEGATED_PERMS)
return (EPERM);
- }
if (dsl_dataset_is_snapshot(ds)) {
/*
@@ -589,7 +577,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
if (dsl_prop_get_dd(dd,
zfs_prop_to_name(ZFS_PROP_ZONED),
- 8, 1, &zoned, NULL) != 0)
+ 8, 1, &zoned, NULL, B_FALSE) != 0)
break;
if (!zoned)
break;
@@ -636,7 +624,6 @@ again:
error = EPERM;
success:
rw_exit(&dp->dp_config_rwlock);
- dsl_dataset_rele(ds, FTAG);
cookie = NULL;
while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -645,6 +632,22 @@ success:
return (error);
}
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (error)
+ return (error);
+
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
/*
* Other routines.
*/
@@ -739,5 +742,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
boolean_t
dsl_delegation_on(objset_t *os)
{
- return (os->os->os_spa->spa_delegation);
+ return (!!spa_delegation(os->os_spa));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
index 2f312ae3410c..1cd49c8274e8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -32,6 +31,7 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
+#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
@@ -39,8 +39,7 @@
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
/* ARGSUSED */
@@ -63,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
spa_close(dd->dd_pool->dp_spa, dd);
/*
- * The props callback list should be empty since they hold the
- * dir open.
+ * The props callback list should have been cleaned up by
+ * objset_evict().
*/
list_destroy(&dd->dd_prop_cbs);
mutex_destroy(&dd->dd_lock);
@@ -107,6 +106,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_node));
+ dsl_dir_snap_cmtime_update(dd);
+
if (dd->dd_phys->dd_parent_obj) {
err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
NULL, dd, &dd->dd_parent);
@@ -133,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
}
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+ if (err)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ }
+
winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
dsl_dir_evict);
if (winner) {
@@ -392,7 +412,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
{
objset_t *mos = dp->dp_meta_objset;
uint64_t ddobj;
- dsl_dir_phys_t *dsphys;
+ dsl_dir_phys_t *ddphys;
dmu_buf_t *dbuf;
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@@ -407,17 +427,17 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
}
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
+ ddphys = dbuf->db_data;
- dsphys->dd_creation_time = gethrestime_sec();
+ ddphys->dd_creation_time = gethrestime_sec();
if (pds)
- dsphys->dd_parent_obj = pds->dd_object;
- dsphys->dd_props_zapobj = zap_create(mos,
+ ddphys->dd_parent_obj = pds->dd_object;
+ ddphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsphys->dd_child_dir_zapobj = zap_create(mos,
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
- dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
@@ -427,7 +447,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
int
dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
dsl_pool_t *dp = dd->dd_pool;
objset_t *mos = dp->dp_meta_objset;
int err;
@@ -454,19 +475,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t val, obj;
+ dsl_prop_setarg_t psa;
+ uint64_t value = 0;
+ uint64_t obj;
dd_used_t t;
ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
/* Remove our reservation. */
- val = 0;
- dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+ dsl_prop_setarg_init_uint64(&psa, "reservation",
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ &value);
+ psa.psa_effective_value = 0; /* predict default value */
+
+ dsl_dir_set_reservation_sync(ds, &psa, tx);
+
ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
for (t = 0; t < DD_USED_NUM; t++)
@@ -640,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
if (used > quota) {
/* over quota */
myspace = 0;
-
- /*
- * While it's OK to be a little over quota, if
- * we think we are using more space than there
- * is in the pool (which is already 1.6% more than
- * dsl_pool_adjustedsize()), something is very
- * wrong.
- */
- ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
} else {
/*
* the lesser of the space provided by our parent and
@@ -676,8 +696,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
{
uint64_t txg = tx->tx_txg;
uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+ uint64_t deferred = 0;
struct tempreserve *tr;
- int enospc = EDQUOT;
+ int retval = EDQUOT;
int txgidx = txg & TXG_MASK;
int i;
uint64_t ref_rsrv = 0;
@@ -703,7 +724,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
*/
if (first && tx->tx_objset) {
int error;
- dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
error = dsl_dataset_check_quota(ds, checkrefquota,
asize, est_inflight, &used_on_disk, &ref_rsrv);
@@ -723,7 +744,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
quota = dd->dd_phys->dd_quota;
/*
- * Adjust the quota against the actual pool size at the root.
+ * Adjust the quota against the actual pool size at the root
+ * minus any outstanding deferred frees.
* To ensure that it's possible to remove files from a full
* pool without inducing transient overcommits, we throttle
* netfree transactions against a quota that is slightly larger,
@@ -732,10 +754,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* removes to get through.
*/
if (dd->dd_parent == NULL) {
+ spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- if (poolsize < quota) {
- quota = poolsize;
- enospc = ENOSPC;
+ deferred = metaslab_class_get_deferred(spa_normal_class(spa));
+ if (poolsize - deferred < quota) {
+ quota = poolsize - deferred;
+ retval = ENOSPC;
}
}
@@ -745,15 +769,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
- if (used_on_disk + est_inflight > quota) {
- if (est_inflight > 0 || used_on_disk < quota)
- enospc = ERESTART;
+ if (used_on_disk + est_inflight >= quota) {
+ if (est_inflight > 0 || used_on_disk < quota ||
+ (retval == ENOSPC && used_on_disk < quota + deferred))
+ retval = ERESTART;
dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
used_on_disk>>10, est_inflight>>10,
- quota>>10, asize>>10, enospc);
+ quota>>10, asize>>10, retval);
mutex_exit(&dd->dd_lock);
- return (enospc);
+ return (retval);
}
/* We need to up our estimated delta before dropping dd_lock */
@@ -987,13 +1012,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
static int
dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
- int err = 0;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ int err;
uint64_t towrite;
- if (new_quota == 0)
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ if (psa->psa_effective_value == 0)
return (0);
mutex_enter(&dd->dd_lock);
@@ -1005,64 +1033,88 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
towrite = dsl_dir_space_towrite(dd);
if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (new_quota < dd->dd_phys->dd_reserved ||
- new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
+ (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
+ psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
err = ENOSPC;
}
mutex_exit(&dd->dd_lock);
return (err);
}
-/* ARGSUSED */
+extern dsl_syncfunc_t dsl_prop_set_sync;
+
static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
+
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(dd, psa);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
- dd->dd_phys->dd_quota = new_quota;
+ dd->dd_phys->dd_quota = effective_value;
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu ",
- (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu ",
+ (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
+
+ err = dsl_dataset_hold(ddname, FTAG, &ds);
if (err)
return (err);
- if (quota != dd->dd_phys->dd_quota) {
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(dd->dd_pool, 0);
-
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, dd, &quota, 0);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
}
+
+ ASSERT(ds->ds_dir == dd);
+
+ /*
+ * If someone removes a file, then tries to set the quota, we want to
+ * make sure the file freeing takes effect.
+ */
+ txg_wait_open(dd->dd_pool, 0);
+
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, ds, &psa, 0);
+
dsl_dir_close(dd, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
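A caller sketch for the new three-argument signature (hypothetical dataset name), setting a local 10 GiB quota:

    error = dsl_dir_set_quota("pool/fs", ZPROP_SRC_LOCAL, 10ULL << 30);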
int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value;
uint64_t used, avail;
+ int err;
+
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ effective_value = psa->psa_effective_value;
/*
* If we are doing the preliminary check in open context, the
@@ -1082,37 +1134,40 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
}
- if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
- uint64_t delta = MAX(used, new_reservation) -
+ if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
+ uint64_t delta = MAX(used, effective_value) -
MAX(used, dd->dd_phys->dd_reserved);
if (delta > avail)
return (ENOSPC);
if (dd->dd_phys->dd_quota > 0 &&
- new_reservation > dd->dd_phys->dd_quota)
+ effective_value > dd->dd_phys->dd_quota)
return (ENOSPC);
}
return (0);
}
-/* ARGSUSED */
static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
uint64_t used;
int64_t delta;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(dd, psa);
+
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
used = dd->dd_phys->dd_used_bytes;
- delta = MAX(used, new_reservation) -
+ delta = MAX(used, effective_value) -
MAX(used, dd->dd_phys->dd_reserved);
- dd->dd_phys->dd_reserved = new_reservation;
+ dd->dd_phys->dd_reserved = effective_value;
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
@@ -1121,23 +1176,39 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu",
- (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu",
+ (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation)
{
dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
+
+ err = dsl_dataset_hold(ddname, FTAG, &ds);
if (err)
return (err);
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ ASSERT(ds->ds_dir == dd);
+
err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, dd, &reservation, 0);
+ dsl_dir_set_reservation_sync, ds, &psa, 0);
+
dsl_dir_close(dd, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -1175,7 +1246,6 @@ struct renamearg {
const char *mynewname;
};
-/*ARGSUSED*/
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1186,8 +1256,14 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
int err;
uint64_t val;
- /* There should be 2 references: the open and the dirty */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ /*
+ * There should only be one reference, from dmu_objset_rename().
+ * Fleeting holds are also possible (e.g., from "zfs list" getting
+ * stats), but any that are present in open context will likely
+ * be gone by syncing context, so only fail from syncing
+ * context.
+ */
+ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
return (EBUSY);
/* check for existing name */
@@ -1216,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct renamearg *ra = arg2;
@@ -1265,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dd->dd_myname, 8, 1, &dd->dd_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+ tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}
int
@@ -1315,3 +1391,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
return (0);
}
+
+timestruc_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ mutex_enter(&dd->dd_lock);
+ t = dd->dd_snap_cmtime;
+ mutex_exit(&dd->dd_lock);
+
+ return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ gethrestime(&t);
+ mutex_enter(&dd->dd_lock);
+ dd->dd_snap_cmtime = t;
+ mutex_exit(&dd->dd_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 0f00bc965dcd..ea5e60d933e4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -19,14 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
@@ -36,22 +38,47 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
-int zfs_txg_synctime = 5; /* target secs to sync a txg */
+int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
uint64_t zfs_write_limit_inflated = 0;
uint64_t zfs_write_limit_override = 0;
-extern uint64_t zfs_write_limit_min;
kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
-static int
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
+ &zfs_no_write_throttle, 0, "");
+TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
+ &zfs_write_limit_shift, 0, "2^N of physical memory");
+SYSCTL_DECL(_vfs_zfs_txg);
+TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
+SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
+ &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");
+
+TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
+ &zfs_write_limit_min, 0, "Minimum write limit");
+TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
+ &zfs_write_limit_max, 0, "Maximum data payload per txg");
+TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
+ &zfs_write_limit_inflated, 0, "");
+TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
+ &zfs_write_limit_override, 0, "");
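Because these sysctls are CTLFLAG_RDTUN, they are read-only at runtime and are set as boot-time loader tunables, e.g. in /boot/loader.conf (illustrative values):

    vfs.zfs.txg.synctime_ms="2000"
    vfs.zfs.write_limit_shift="4"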
+
+int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
uint64_t obj;
@@ -89,7 +116,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dataset_t, ds_synced_link));
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
@@ -104,13 +130,13 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dsl_dir_t *dd;
dsl_dataset_t *ds;
- objset_impl_t *osi;
+ uint64_t obj;
rw_enter(&dp->dp_config_rwlock, RW_WRITER);
- err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+ &dp->dp_meta_objset);
if (err)
goto out;
- dp->dp_meta_objset = &osi->os;
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
@@ -135,8 +161,8 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
FTAG, &ds);
if (err == 0) {
err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, dp,
- &dp->dp_origin_snap);
+ ds->ds_phys->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
dsl_dataset_rele(ds, FTAG);
}
dsl_dir_close(dd, dp);
@@ -144,53 +170,30 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
goto out;
}
- /* get scrub status */
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func);
- if (err == 0) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark);
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
if (err)
goto out;
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors);
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
if (err)
goto out;
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- /*
- * A new-type scrub was in progress on an old
- * pool. Restart from the beginning, since the
- * old software may have changed the pool in the
- * meantime.
- */
- dsl_pool_scrub_restart(dp);
- }
- } else {
- /*
- * It's OK if there is no scrub in progress (and if
- * there was an I/O error, ignore it).
- */
- err = 0;
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
}
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+ &dp->dp_tmp_userrefs_obj);
+ if (err == ENOENT)
+ err = 0;
+ if (err)
+ goto out;
+
+ err = dsl_scan_init(dp, txg);
+
out:
rw_exit(&dp->dp_config_rwlock);
if (err)
@@ -215,23 +218,27 @@ dsl_pool_close(dsl_pool_t *dp)
dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
if (dp->dp_mos_dir)
dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir)
+ dsl_dir_close(dp->dp_free_dir, dp);
if (dp->dp_root_dir)
dsl_dir_close(dp->dp_root_dir, dp);
+ bpobj_close(&dp->dp_free_bpobj);
+
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
if (dp->dp_meta_objset)
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+ dmu_objset_evict(dp->dp_meta_objset);
txg_list_destroy(&dp->dp_dirty_datasets);
- txg_list_destroy(&dp->dp_dirty_dirs);
txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_dirty_dirs);
list_destroy(&dp->dp_synced_datasets);
arc_flush(dp->dp_spa);
txg_fini(dp);
+ dsl_scan_fini(dp);
rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
- mutex_destroy(&dp->dp_scrub_cancel_lock);
taskq_destroy(dp->dp_vnrele_taskq);
if (dp->dp_blkstats)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -244,19 +251,22 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
- objset_impl_t *osip;
+ objset_t *os;
dsl_dataset_t *ds;
- uint64_t dsobj;
+ uint64_t obj;
/* create and open the MOS (meta-objset) */
- dp->dp_meta_objset = &dmu_objset_create_impl(spa,
- NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+ dp->dp_meta_objset = dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
ASSERT3U(err, ==, 0);
+ /* Initialize scan structures */
+ VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
/* create and open the root dir */
dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@@ -267,18 +277,33 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
VERIFY(0 == dsl_pool_open_special_dir(dp,
MOS_DIR_NAME, &dp->dp_mos_dir));
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
/* create the root dataset */
- dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
/* create the root objset */
- VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- osip = dmu_objset_create_impl(dp->dp_spa, ds,
+ VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+ os = dmu_objset_create_impl(dp->dp_spa, ds,
dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
- zfs_create_fs(&osip->os, kcred, zplprops, tx);
+ zfs_create_fs(os, kcred, zplprops, tx);
#endif
dsl_dataset_rele(ds, FTAG);
@@ -287,6 +312,14 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
return (dp);
}
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@@ -295,11 +328,19 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dir_t *dd;
dsl_dataset_t *ds;
dsl_sync_task_group_t *dstg;
- objset_impl_t *mosi = dp->dp_meta_objset->os;
+ objset_t *mos = dp->dp_meta_objset;
hrtime_t start, write_time;
uint64_t data_written;
int err;
+ /*
+ * We need to copy dp_space_towrite() before doing
+ * dsl_sync_task_group_sync(), because
+ * dsl_dataset_snapshot_reserve_space() will increase
+ * dp_space_towrite but not actually write anything.
+ */
+ data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
tx = dmu_tx_create_assigned(dp, txg);
dp->dp_read_overhead = 0;
@@ -325,11 +366,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
for (ds = list_head(&dp->dp_synced_datasets); ds;
ds = list_next(&dp->dp_synced_datasets, ds))
- dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+ dmu_objset_do_userquota_updates(ds->ds_objset, tx);
/*
* Sync the datasets again to push out the changes due to
- * userquota updates. This must be done before we process the
+ * userspace updates. This must be done before we process the
* sync tasks, because that could cause a snapshot of a dataset
* whose ds_bp will be rewritten when we do this 2nd sync.
*/
@@ -341,6 +382,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}
err = zio_wait(zio);
+ /*
+ * Move dead blocks from the pending deadlist to the on-disk
+ * deadlist.
+ */
+ for (ds = list_head(&dp->dp_synced_datasets); ds;
+ ds = list_next(&dp->dp_synced_datasets, ds)) {
+ bplist_iterate(&ds->ds_pending_deadlist,
+ deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+ }
+
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
/*
* No more sync tasks should have been added while we
@@ -356,14 +407,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dir_sync(dd, tx);
write_time += gethrtime() - start;
- if (spa_sync_pass(dp->dp_spa) == 1)
- dsl_pool_scrub_sync(dp, tx);
-
start = gethrtime();
- if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
- list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+ if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+ list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(mosi, zio, tx);
+ dmu_objset_sync(mos, zio, tx);
err = zio_wait(zio);
ASSERT(err == 0);
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
@@ -376,7 +424,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dmu_tx_commit(tx);
- data_written = dp->dp_space_towrite[txg & TXG_MASK];
dp->dp_space_towrite[txg & TXG_MASK] = 0;
ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
@@ -401,10 +448,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* amount of write traffic allowed into each transaction group.
* Weight the throughput calculation towards the current value:
* thru = 3/4 old_thru + 1/4 new_thru
+ *
+ * Note: write_time is in nanosecs, so write_time/MICROSEC
+ * yields millisecs
*/
ASSERT(zfs_write_limit_min > 0);
- if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
- uint64_t throughput = (data_written * NANOSEC) / write_time;
+ if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
+ uint64_t throughput = data_written / (write_time / MICROSEC);
+
if (dp->dp_throughput)
dp->dp_throughput = throughput / 4 +
3 * dp->dp_throughput / 4;
@@ -412,21 +463,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dp->dp_throughput = throughput;
dp->dp_write_limit = MIN(zfs_write_limit_inflated,
MAX(zfs_write_limit_min,
- dp->dp_throughput * zfs_txg_synctime));
+ dp->dp_throughput * zfs_txg_synctime_ms));
}
}
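Worked numbers for the update above (all values hypothetical): a txg that wrote 256 MiB with write_time = 2e9 ns gives write_time / MICROSEC = 2000 ms, so new_thru = 268435456 / 2000 ≈ 134218 bytes/ms (about 128 MiB/s). With a previous dp_throughput of 200000 bytes/ms, the weighted update yields 3/4 * 200000 + 134218 / 4 ≈ 183554 bytes/ms, and the next dp_write_limit is roughly that rate times zfs_txg_synctime_ms (1000 ms by default), clamped between zfs_write_limit_min and zfs_write_limit_inflated.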
void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
dsl_dataset_t *ds;
+ objset_t *os;
while (ds = list_head(&dp->dp_synced_datasets)) {
list_remove(&dp->dp_synced_datasets, ds);
- ASSERT(ds->ds_user_ptr != NULL);
- zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+ os = ds->ds_objset;
+ zil_clean(os->os_zil, txg);
+ ASSERT(!dmu_objset_is_dirty(os, txg));
dmu_buf_rele(ds->ds_dbuf, ds);
}
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}
/*
@@ -627,6 +681,65 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
tx, DS_FIND_CHILDREN));
}
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj) {
+ dsl_dataset_t *origin;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+
+ VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+}
+
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
@@ -641,7 +754,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+ dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
dp, &dp->dp_origin_snap));
dsl_dataset_rele(ds, FTAG);
@@ -653,3 +766,108 @@ dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
return (dp->dp_vnrele_taskq);
}
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+ zap_attribute_t za;
+ zap_cursor_t zc;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+
+ if (zapobj == 0)
+ return;
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ char *htag;
+ uint64_t dsobj;
+
+ htag = strchr(za.za_name, '-');
+ *htag = '\0';
+ ++htag;
+ dsobj = strtonum(za.za_name, NULL);
+ (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(dp->dp_tmp_userrefs_obj == 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
+ DMU_OT_NONE, 0, tx);
+
+ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
+ sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ char *name;
+ int error;
+
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * If the pool was created prior to SPA_VERSION_USERREFS, the
+ * zap object for temporary holds might not exist yet.
+ */
+ if (zapobj == 0) {
+ if (holding) {
+ dsl_pool_user_hold_create_obj(dp, tx);
+ zapobj = dp->dp_tmp_userrefs_obj;
+ } else {
+ return (ENOENT);
+ }
+ }
+
+ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+ if (holding)
+ error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+ else
+ error = zap_remove(mos, zapobj, name, tx);
+ strfree(name);
+
+ return (error);
+}
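The key layout used here, with a made-up entry: a hold on dataset object 0x1a3 with tag "send-1234" is stored under the ZAP name "1a3-send-1234". dsl_pool_clean_tmp_userrefs() splits on the first '-', which is unambiguous because the leading component is pure hex and can never contain a dash; the tag itself may.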
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ uint64_t *now, dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+ tx, B_FALSE));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
index d06493236805..aa66b32e7938 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -19,10 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
@@ -31,14 +31,16 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include "zfs_prop.h"
+#define ZPROP_INHERIT_SUFFIX "$inherit"
+#define ZPROP_RECVD_SUFFIX "$recvd"
+
static int
-dodefault(const char *propname, int intsz, int numint, void *buf)
+dodefault(const char *propname, int intsz, int numints, void *buf)
{
zfs_prop_t prop;
@@ -55,9 +57,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
if (intsz != 1)
return (EOVERFLOW);
(void) strncpy(buf, zfs_prop_default_string(prop),
- numint);
+ numints);
} else {
- if (intsz != 8 || numint < 1)
+ if (intsz != 8 || numints < 1)
return (EOVERFLOW);
*(uint64_t *)buf = zfs_prop_default_numeric(prop);
@@ -68,11 +70,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
int
dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numint, void *buf, char *setpoint)
+ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
{
int err = ENOENT;
+ dsl_dir_t *target = dd;
objset_t *mos = dd->dd_pool->dp_meta_objset;
zfs_prop_t prop;
+ boolean_t inheritable;
+ boolean_t inheriting = B_FALSE;
+ char *inheritstr;
+ char *recvdstr;
ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
@@ -80,51 +87,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
setpoint[0] = '\0';
prop = zfs_name_to_prop(propname);
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
/*
- * Note: dd may be NULL, therefore we shouldn't dereference it
- * ouside this loop.
+ * Note: dd may become NULL, therefore we shouldn't dereference it
+ * after this loop.
*/
for (; dd != NULL; dd = dd->dd_parent) {
ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
- propname, intsz, numint, buf);
+
+ if (dd != target || snapshot) {
+ if (!inheritable)
+ break;
+ inheriting = B_TRUE;
+ }
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+ intsz, numints, buf);
if (err != ENOENT) {
- if (setpoint)
+ if (setpoint != NULL && err == 0)
dsl_dir_name(dd, setpoint);
break;
}
/*
- * Break out of this loop for non-inheritable properties.
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
*/
- if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+ inheritstr);
+ if (err != 0 && err != ENOENT)
break;
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ recvdstr, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0) {
+ if (inheriting) {
+ dsl_dir_name(dd, setpoint);
+ } else {
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If we found an explicit inheritance entry, err is zero even
+ * though we haven't yet found the value, so reinitializing err
+ * at the end of the loop (instead of at the beginning) ensures
+ * that err has a valid post-loop value.
+ */
+ err = ENOENT;
}
+
if (err == ENOENT)
- err = dodefault(propname, intsz, numint, buf);
+ err = dodefault(propname, intsz, numints, buf);
+
+ strfree(inheritstr);
+ strfree(recvdstr);
return (err);
}
int
dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
- int intsz, int numint, void *buf, char *setpoint)
+ int intsz, int numints, void *buf, char *setpoint)
{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t inheritable;
+ boolean_t snapshot;
+ uint64_t zapobj;
+
ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
+ zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+
+ if (zapobj != 0) {
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
- if (ds->ds_phys->ds_props_obj) {
- int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_props_obj, propname, intsz, numint, buf);
+ ASSERT(snapshot);
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
if (err != ENOENT) {
- if (setpoint)
+ if (setpoint != NULL && err == 0)
dsl_dataset_name(ds, setpoint);
return (err);
}
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ if (inheritable) {
+ char *inheritstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, zapobj, inheritstr);
+ strfree(inheritstr);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ char *recvdstr = kmem_asprintf("%s%s", propname,
+ ZPROP_RECVD_SUFFIX);
+ err = zap_lookup(mos, zapobj, recvdstr,
+ intsz, numints, buf);
+ strfree(recvdstr);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ return (err);
+ }
+ }
}
return (dsl_prop_get_dd(ds->ds_dir, propname,
- intsz, numint, buf, setpoint));
+ intsz, numints, buf, setpoint, snapshot));
}
/*
@@ -168,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
- NULL, cbr, &dd));
if (need_rwlock)
rw_exit(&dp->dp_config_rwlock);
- /* Leave dir open until this callback is unregistered */
return (0);
}
@@ -210,6 +298,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname,
return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
}
+void
+dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+ zprop_source_t source, uint64_t *value)
+{
+ psa->psa_name = propname;
+ psa->psa_source = source;
+ psa->psa_intsz = 8;
+ psa->psa_numints = 1;
+ psa->psa_value = value;
+
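+	/* -1 serves as an "effective value not yet predicted" sentinel. */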
+ psa->psa_effective_value = -1ULL;
+}
+
+/*
+ * Predict the effective value of the given special property if it were
+ * set with the given value and source. This is not a general-purpose
+ * function; it exists only to handle the special requirements of the
+ * quota and reservation properties. The fact that these properties are
+ * non-inheritable greatly simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called with
+ * a property not handled by this function.
+ */
+int
+dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+ const char *propname = psa->psa_name;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ zprop_source_t source = psa->psa_source;
+ objset_t *mos;
+ uint64_t zapobj;
+ uint64_t version;
+ char *recvdstr;
+ int err = 0;
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFQUOTA:
+ case ZFS_PROP_REFRESERVATION:
+ break;
+ default:
+ return (-1);
+ }
+
+ mos = dd->dd_pool->dp_meta_objset;
+ zapobj = dd->dd_phys->dd_props_zapobj;
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
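+	/*
+	 * On pools that predate received properties, a receive behaves like
+	 * a local set and reverting to "none" simply clears the value, so
+	 * collapse the source bits accordingly before predicting.
+	 */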
+ version = spa_version(dd->dd_pool->dp_spa);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /* Revert to the received value, if any. */
+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = 0;
+ break;
+ case ZPROP_SRC_LOCAL:
+ psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * If there's no local setting, then the new received value will
+ * be the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * We're clearing the received value, so the local setting (if
+ * it exists) remains the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = 0;
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
+ }
+
+ strfree(recvdstr);
+
+ if (err == ENOENT)
+ return (0);
+
+ return (err);
+}
+
+#ifdef ZFS_DEBUG
+void
+dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+ zfs_prop_t prop = zfs_name_to_prop(psa->psa_name);
+ uint64_t intval;
+ char setpoint[MAXNAMELEN];
+ uint64_t version = spa_version(dd->dd_pool->dp_spa);
+ int err;
+
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ return;
+ }
+ }
+
+ err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval,
+ setpoint, B_FALSE);
+ if (err == 0 && intval != psa->psa_effective_value) {
+ cmn_err(CE_PANIC, "%s property, source: %x, "
+ "predicted effective value: %llu, "
+ "actual effective value: %llu (setpoint: %s)",
+ psa->psa_name, psa->psa_source,
+ (unsigned long long)psa->psa_effective_value,
+ (unsigned long long)intval, setpoint);
+ }
+}
+#endif
+
/*
* Unregister this callback. Return 0 on success, ENOENT if ddname is
* invalid, ENOMSG if no matching callback registered.
@@ -241,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
- /* Clean up from dsl_prop_register */
- dsl_dir_close(dd, cbr);
return (0);
}
@@ -277,7 +494,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
zap_cursor_t zc;
zap_attribute_t *za;
int err;
- uint64_t dummyval;
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
@@ -289,8 +505,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
* If the prop is set here, then this change is not
* being inherited here or below; stop the recursion.
*/
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- 8, 1, &dummyval);
+ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
if (err == 0) {
dsl_dir_close(dd, FTAG);
return;
@@ -310,8 +525,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
* If the property is set on this ds, then it is not
* inherited here; don't call the callback.
*/
- if (propobj && 0 == zap_lookup(mos, propobj, propname,
- 8, 1, &dummyval))
+ if (propobj && 0 == zap_contains(mos, propobj, propname))
continue;
cbr->cbr_func(cbr->cbr_arg, value);
@@ -331,30 +545,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
dsl_dir_close(dd, FTAG);
}
-struct prop_set_arg {
- const char *name;
- int intsz;
- int numints;
- const void *buf;
-};
-
-
-static void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+void
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- struct prop_set_arg *psa = arg2;
+ dsl_prop_setarg_t *psa = arg2;
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t zapobj, intval;
+ uint64_t zapobj, intval, dummy;
int isint;
char valbuf[32];
- char *valstr;
+ char *valstr = NULL;
+ char *inheritstr;
+ char *recvdstr;
+ char *tbuf = NULL;
+ int err;
+ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ const char *propname = psa->psa_name;
+ zprop_source_t source = psa->psa_source;
- isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+ isint = (dodefault(propname, 8, 1, &intval) == 0);
- if (dsl_dataset_is_snapshot(ds)) {
- ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >=
- SPA_VERSION_SNAP_PROPS);
+ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+ ASSERT(version >= SPA_VERSION_SNAP_PROPS);
if (ds->ds_phys->ds_props_obj == 0) {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_props_obj =
@@ -366,22 +578,97 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
}
- if (psa->numints == 0) {
- int err = zap_remove(mos, zapobj, psa->name, tx);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION)
+ return;
+
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /*
+ * revert to received value, if any (inherit -S)
+ * - remove propname
+ * - remove propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ case ZPROP_SRC_LOCAL:
+ /*
+ * remove propname$inherit
+ * set propname -> value
+ */
+ err = zap_remove(mos, zapobj, inheritstr, tx);
ASSERT(err == 0 || err == ENOENT);
- if (isint) {
- VERIFY(0 == dsl_prop_get_ds(ds,
- psa->name, 8, 1, &intval, NULL));
+ VERIFY(0 == zap_update(mos, zapobj, propname,
+ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx));
+ break;
+ case ZPROP_SRC_INHERITED:
+ /*
+ * explicitly inherit
+ * - remove propname
+ * - set propname$inherit
+ */
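+		/*
+		 * The $inherit entry only matters when there is a received
+		 * value to mask, hence the ZPROP_HAS_RECVD check below.
+		 */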
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (version >= SPA_VERSION_RECVD_PROPS &&
+ dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy,
+ NULL) == 0) {
+ dummy = 0;
+ err = zap_update(mos, zapobj, inheritstr,
+ 8, 1, &dummy, tx);
+ ASSERT(err == 0);
}
- } else {
- VERIFY(0 == zap_update(mos, zapobj, psa->name,
- psa->intsz, psa->numints, psa->buf, tx));
- if (isint)
- intval = *(uint64_t *)psa->buf;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * set propname$recvd -> value
+ */
+ err = zap_update(mos, zapobj, recvdstr,
+ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx);
+ ASSERT(err == 0);
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+ /*
+ * clear local and received settings
+ * - remove propname
+ * - remove propname$inherit
+ * - remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ /* FALLTHRU */
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, recvdstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
}
+ strfree(inheritstr);
+ strfree(recvdstr);
+
if (isint) {
- if (dsl_dataset_is_snapshot(ds)) {
+ VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL));
+
+ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
dsl_prop_cb_record_t *cbr;
/*
* It's a snapshot; nothing can inherit this
@@ -392,58 +679,85 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
if (cbr->cbr_ds == ds &&
- strcmp(cbr->cbr_propname, psa->name) == 0)
+ strcmp(cbr->cbr_propname, propname) == 0)
cbr->cbr_func(cbr->cbr_arg, intval);
}
mutex_exit(&ds->ds_dir->dd_lock);
} else {
dsl_prop_changed_notify(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_object, psa->name, intval, TRUE);
+ ds->ds_dir->dd_object, propname, intval, TRUE);
}
- }
- if (isint) {
+
(void) snprintf(valbuf, sizeof (valbuf),
"%lld", (longlong_t)intval);
valstr = valbuf;
} else {
- valstr = (char *)psa->buf;
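+		/*
+		 * For a non-local source, the string being set is not
+		 * necessarily the new effective value, so look that up
+		 * for the history log.
+		 */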
+ if (source == ZPROP_SRC_LOCAL) {
+ valstr = (char *)psa->psa_value;
+ } else {
+ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ if (dsl_prop_get_ds(ds, propname, 1,
+ ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+ valstr = tbuf;
+ }
}
- spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
- LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
- "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
+
+ spa_history_log_internal((source == ZPROP_SRC_NONE ||
+ source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
+ LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx,
+ "%s=%s dataset = %llu", propname,
+ (valstr == NULL ? "" : valstr), ds->ds_object);
+
+ if (tbuf != NULL)
+ kmem_free(tbuf, ZAP_MAXVALUELEN);
}
void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- nvlist_t *nvl = arg2;
+ dsl_props_arg_t *pa = arg2;
+ nvlist_t *props = pa->pa_props;
+ dsl_prop_setarg_t psa;
nvpair_t *elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- struct prop_set_arg psa;
+ psa.psa_source = pa->pa_source;
- psa.name = nvpair_name(elem);
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ nvpair_t *pair = elem;
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- VERIFY(nvpair_value_string(elem,
- (char **)&psa.buf) == 0);
- psa.intsz = 1;
- psa.numints = strlen(psa.buf) + 1;
+ psa.psa_name = nvpair_name(pair);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_STRING) {
+ VERIFY(nvpair_value_string(pair,
+ (char **)&psa.psa_value) == 0);
+ psa.psa_intsz = 1;
+ psa.psa_numints = strlen(psa.psa_value) + 1;
} else {
uint64_t intval;
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
- psa.intsz = sizeof (intval);
- psa.numints = 1;
- psa.buf = &intval;
+ VERIFY(nvpair_value_uint64(pair, &intval) == 0);
+ psa.psa_intsz = sizeof (intval);
+ psa.psa_numints = 1;
+ psa.psa_value = &intval;
}
- dsl_prop_set_sync(ds, &psa, cr, tx);
+ dsl_prop_set_sync(ds, &psa, tx);
}
}
void
-dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx)
+dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
@@ -454,18 +768,19 @@ dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
- spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx,
"%s=%llu dataset = %llu", name, (u_longlong_t)val,
dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_prop_set(const char *dsname, const char *propname,
+dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source,
int intsz, int numints, const void *buf)
{
dsl_dataset_t *ds;
+ uint64_t version;
int err;
- struct prop_set_arg psa;
+ dsl_prop_setarg_t psa;
/*
* We must do these checks before we get to the syncfunc, since
@@ -473,23 +788,30 @@ dsl_prop_set(const char *dsname, const char *propname,
*/
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG);
- if (intsz * numints >= ZAP_MAXVALUELEN)
- return (E2BIG);
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (E2BIG);
+ }
if (dsl_dataset_is_snapshot(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
- psa.name = propname;
- psa.intsz = intsz;
- psa.numints = numints;
- psa.buf = buf;
+ psa.psa_name = propname;
+ psa.psa_source = source;
+ psa.psa_intsz = intsz;
+ psa.psa_numints = numints;
+ psa.psa_value = buf;
+ psa.psa_effective_value = -1ULL;
+
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
NULL, dsl_prop_set_sync, ds, &psa, 2);
@@ -498,158 +820,318 @@ dsl_prop_set(const char *dsname, const char *propname,
}
int
-dsl_props_set(const char *dsname, nvlist_t *nvl)
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
{
dsl_dataset_t *ds;
+ uint64_t version;
nvpair_t *elem = NULL;
+ dsl_props_arg_t pa;
int err;
+ if (err = dsl_dataset_hold(dsname, FTAG, &ds))
+ return (err);
/*
* Do these checks before the syncfunc, since it can't fail.
*/
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN)
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
return (ENAMETOOLONG);
+ }
if (nvpair_type(elem) == DATA_TYPE_STRING) {
char *valstr;
VERIFY(nvpair_value_string(elem, &valstr) == 0);
- if (strlen(valstr) >= ZAP_MAXVALUELEN)
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
return (E2BIG);
+ }
}
}
- if (err = dsl_dataset_hold(dsname, FTAG, &ds))
- return (err);
-
if (dsl_dataset_is_snapshot(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
+ pa.pa_props = props;
+ pa.pa_source = source;
+
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- NULL, dsl_props_set_sync, ds, nvl, 2);
+ NULL, dsl_props_set_sync, ds, &pa, 2);
dsl_dataset_rele(ds, FTAG);
return (err);
}
+typedef enum dsl_prop_getflags {
+ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
+ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
+ DSL_PROP_GET_LOCAL = 0x4, /* local properties */
+ DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err = 0;
+
+ for (zap_cursor_init(&zc, mos, propobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ char buf[ZAP_MAXNAMELEN];
+ char *valstr;
+ const char *suffix;
+ const char *propname;
+ const char *source;
+
+ suffix = strchr(za.za_name, '$');
+
+ if (suffix == NULL) {
+ /*
+ * Skip local properties if we only want received
+ * properties.
+ */
+ if (flags & DSL_PROP_GET_RECEIVED)
+ continue;
+
+ propname = za.za_name;
+ source = setpoint;
+ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+ /* Skip explicitly inherited entries. */
+ continue;
+ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+ if (flags & DSL_PROP_GET_LOCAL)
+ continue;
+
+ (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+ buf[suffix - za.za_name] = '\0';
+ propname = buf;
+
+ if (!(flags & DSL_PROP_GET_RECEIVED)) {
+ /* Skip if locally overridden. */
+ err = zap_contains(mos, propobj, propname);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+
+ /* Skip if explicitly inherited. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ strfree(valstr);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+ }
+
+ source = ((flags & DSL_PROP_GET_INHERITING) ?
+ setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else {
+ /*
+ * For backward compatibility, skip suffixes we don't
+ * recognize.
+ */
+ continue;
+ }
+
+ prop = zfs_name_to_prop(propname);
+
+ /* Skip non-inheritable properties. */
+ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ continue;
+
+ /* Skip properties not valid for this type. */
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+ continue;
+
+ /* Skip properties already defined. */
+ if (nvlist_exists(nv, propname))
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
/*
* Iterate over all properties for this dataset and return them in an nvlist.
*/
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local)
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+ dsl_prop_getflags_t flags)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
dsl_dir_t *dd = ds->ds_dir;
- boolean_t snapshot = dsl_dataset_is_snapshot(ds);
- int err = 0;
dsl_pool_t *dp = dd->dd_pool;
objset_t *mos = dp->dp_meta_objset;
- uint64_t propobj = ds->ds_phys->ds_props_obj;
+ int err = 0;
+ char setpoint[MAXNAMELEN];
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (local && snapshot && !propobj)
- return (0);
+ if (dsl_dataset_is_snapshot(ds))
+ flags |= DSL_PROP_GET_SNAPSHOT;
rw_enter(&dp->dp_config_rwlock, RW_READER);
- while (dd != NULL) {
- char setpoint[MAXNAMELEN];
- zap_cursor_t zc;
- zap_attribute_t za;
- dsl_dir_t *dd_next;
-
- if (propobj) {
- dsl_dataset_name(ds, setpoint);
- dd_next = dd;
- } else {
- dsl_dir_name(dd, setpoint);
- propobj = dd->dd_phys->dd_props_zapobj;
- dd_next = dd->dd_parent;
+
+ if (ds->ds_phys->ds_props_obj != 0) {
+ ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+ dsl_dataset_name(ds, setpoint);
+ err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
+ setpoint, flags, *nvp);
+ if (err)
+ goto out;
+ }
+
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+ if (flags & (DSL_PROP_GET_LOCAL |
+ DSL_PROP_GET_RECEIVED))
+ break;
+ flags |= DSL_PROP_GET_INHERITING;
}
+ dsl_dir_name(dd, setpoint);
+ err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
+ setpoint, flags, *nvp);
+ if (err)
+ break;
+ }
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+}
- for (zap_cursor_init(&zc, mos, propobj);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- nvlist_t *propval;
- zfs_prop_t prop = zfs_name_to_prop(za.za_name);
+boolean_t
+dsl_prop_get_hasrecvd(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int rc;
+ uint64_t dummy;
- /* Skip non-inheritable properties. */
- if (prop != ZPROP_INVAL &&
- !zfs_prop_inheritable(prop) &&
- (dd != ds->ds_dir || (snapshot && dd != dd_next)))
- continue;
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL);
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+ return (rc == 0);
+}
- /* Skip properties not valid for this type. */
- if (snapshot && prop != ZPROP_INVAL &&
- !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
- continue;
+static void
+dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ uint64_t dummy = 0;
+ dsl_prop_setarg_t psa;
- /* Skip properties already defined */
- if (nvlist_lookup_nvlist(*nvp, za.za_name,
- &propval) == 0)
- continue;
+ if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS)
+ return;
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- if (za.za_integer_length == 1) {
- /*
- * String property
- */
- char *tmp = kmem_alloc(za.za_num_integers,
- KM_SLEEP);
- err = zap_lookup(mos, propobj,
- za.za_name, 1, za.za_num_integers, tmp);
- if (err != 0) {
- kmem_free(tmp, za.za_num_integers);
- break;
- }
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
- tmp) == 0);
- kmem_free(tmp, za.za_num_integers);
- } else {
- /*
- * Integer property
- */
- ASSERT(za.za_integer_length == 8);
- (void) nvlist_add_uint64(propval, ZPROP_VALUE,
- za.za_first_integer);
- }
+ dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy);
- VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
- setpoint) == 0);
- VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
- propval) == 0);
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
+ (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL,
+ dsl_prop_set_sync, ds, &psa, 2);
+}
- if (err != ENOENT)
- break;
- err = 0;
- /*
- * If we are just after the props that have been set
- * locally, then we are done after the first iteration.
- */
- if (local)
- break;
- dd = dd_next;
- propobj = 0;
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+void
+dsl_prop_set_hasrecvd(objset_t *os)
+{
+ if (dsl_prop_get_hasrecvd(os)) {
+ ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+ return;
}
- rw_exit(&dp->dp_config_rwlock);
+ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL);
+}
- return (err);
+void
+dsl_prop_unset_hasrecvd(objset_t *os)
+{
+ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE);
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(objset_t *os, nvlist_t **nvp)
+{
+ /*
+ * Received properties are not distinguishable from local properties
+ * until the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ?
+ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags));
}
void
dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
{
nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+ uint64_t default_value;
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ /* Indicate the default source if we can. */
+ if (dodefault(propname, 8, 1, &default_value) == 0 &&
+ value == default_value) {
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+ }
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
nvlist_free(propval);
}
@@ -657,9 +1139,15 @@ void
dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
{
nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
nvlist_free(propval);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
new file mode 100644
index 000000000000..56d41083673e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -0,0 +1,1766 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scan_cb_t dsl_scan_defrag_cb;
+static scan_cb_t dsl_scan_scrub_cb;
+static scan_cb_t dsl_scan_remove_cb;
+static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_top_maxinflight = 32; /* maximum I/Os per top-level vdev */
+int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
+int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
+int zfs_scan_idle = 50; /* idle window in clock ticks */
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ scn->scn_restart_txg);
+ }
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan) {
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (EBUSY);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+ pool_scan_func_t *funcp = arg2;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_restart_txg = 0;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ } else {
+ spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
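+	/*
+	 * Pools from before SPA_VERSION_DSL_SCRUB presumably don't know the
+	 * DMU_OT_SCAN_QUEUE object type, so fall back to a generic ZAP
+	 * object type on them.
+	 */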
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ dsl_scan_sync_state(scn, tx);
+
+ spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (complete)
+ scn->scn_phys.scn_state = DSS_FINISHED;
+ else
+ scn->scn_phys.scn_state = DSS_CANCELED;
+
+ spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+ "complete=%u", complete);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ spa->spa_scrub_started = B_FALSE;
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+ if (complete) {
+ spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+ ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return (ENOENT);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ boolean_t complete = B_FALSE;
+ int err;
+
+ err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+ return (err);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp,
+ const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+int
+dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read(pio, spa, bpp, pbuf, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+int
+dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read_nolock(pio, spa, bpp, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+ return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+ const zbookmark_t *zb2)
+{
+ uint64_t zb1nextL0, zb2thisobj;
+
+ ASSERT(zb1->zb_objset == zb2->zb_objset);
+ ASSERT(zb2->zb_level == 0);
+
+ /*
+ * A bookmark in the deadlist is considered to be after
+ * everything else.
+ */
+ if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+ return (B_TRUE);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
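+	/*
+	 * zb1nextL0 is the first level-0 block id past the subtree rooted
+	 * at zb1; each indirect level multiplies the span by the number of
+	 * block pointers per indirect block.
+	 */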
+ zb1nextL0 = (zb1->zb_blkid + 1) <<
+ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+ zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t nextobj = zb1nextL0 *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ return (nextobj <= zb2thisobj);
+ }
+
+ if (zb1->zb_object < zb2thisobj)
+ return (B_TRUE);
+ if (zb1->zb_object > zb2thisobj)
+ return (B_FALSE);
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ return (B_FALSE);
+ return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (dsl_dataset_is_snapshot(ds))
+ return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+}
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+{
+ uint64_t elapsed_nanosecs;
+ int mintime;
+
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_pausing)
+ return (B_TRUE); /* we're already pausing */
+
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb && zb->zb_level != 0)
+ return (B_FALSE);
+
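+	/*
+	 * Pause if this sync pass has exceeded the txg timeout, or has run
+	 * past the per-scan-type minimum with another txg sync waiting, or
+	 * if the pool is shutting down.
+	 */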
+ mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > mintime &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa)) {
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ }
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ scn->scn_pausing = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+	 * One block ("stubby") may have been allocated long ago; we
+	 * want to visit it because it has been allocated (on-disk)
+	 * even if it hasn't been claimed (though for a scrub there
+	 * is nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+		 * birth can be < claim_txg if this record's txg has
+		 * already been synced (but this log block contains
+		 * other records that have not)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+ uint64_t objset, uint64_t object, uint64_t blkid)
+{
+ zbookmark_t czb;
+ uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+ if (zfs_no_scrub_prefetch)
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ return;
+
+ SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+ /*
+ * XXX need to make sure all of these arc_read() prefetches are
+ * done before setting xlateall (similar to dsl_read())
+ */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for pausing
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Return nonzero on i/o error.
+ * Return new buf to write out in *bufp.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
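+		/* epb: block pointers per indirect block */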
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ zb->zb_object, zb->zb_blkid * epb + i);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ *bufp, ds, scn, ostype, tx);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+ uint32_t flags = ARC_WAIT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ dnode_phys_t *cdnp;
+ int i, j;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
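+		/* epb: dnodes per block */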
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (j = 0; j < cdnp->dn_nblkptr; j++) {
+ blkptr_t *cbp = &cdnp->dn_blkptr[j];
+ dsl_scan_prefetch(scn, *bufp, cbp,
+ zb->zb_objset, zb->zb_blkid * epb + i, j);
+ }
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ }
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = (*bufp)->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * pausing. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode, *bufp,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode, *bufp,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ }
+
+ return (0);
+}
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(&dnp->dn_spill,
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+ dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ arc_buf_t *buf = NULL;
+ blkptr_t bp_toread = *bp;
+
+ /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+ if (dsl_scan_check_pause(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ if (bp->blk_birth == 0)
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pbuf, bp);
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return;
+
+ if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+ /*
+ * For non-user-accounting blocks, we need to read the
+ * new bp (from a deleted snapshot, found in
+ * check_existing_xlation). If we used the old bp,
+ * pointers inside this block from before we resumed
+ * would be untranslated.
+ *
+ * For user-accounting blocks, we need to read the old
+ * bp, because we will apply the entire space delta to
+ * it (original untranslated -> translations from
+ * deleted snap -> now).
+ */
+ bp_toread = *bp;
+ }
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+ &buf) != 0)
+ return;
+
+ /*
+	 * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ ASSERT(buf == NULL);
+ return;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ }
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_t zb;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ dsl_scan_visitbp(bp, &zb, NULL, NULL,
+ ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ if (dsl_dataset_is_snapshot(ds)) {
+ /* Note, scn_cur_{min,max}_txg stays the same. */
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else {
+ zfs_dbgmsg("destroying ds %llu; ignoring",
+ (u_longlong_t)ds->ds_object);
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ }
+ dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, &mintxg) == 0) {
+ int err;
+
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ err = zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+ VERIFY(err == 0 || err == EEXIST);
+ if (err == EEXIST) {
+ /* Both were there to begin with */
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, mintxg, tx));
+ }
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+}
+
+struct enqueue_clones_arg {
+ dmu_tx_t *tx;
+ uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct enqueue_clones_arg *eca = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (dmu_objset_from_ds(ds, &os))
+ goto out;
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. So we traverse the
+ * ZIL here, rather than in scan_recurse(), because the regular
+ * snapshot block-sharing rules don't apply to it.
+ */
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+ dsl_scan_zil(dp, &os->os_zil_header);
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+ char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "pausing=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_pausing);
+ kmem_free(dsname, ZFS_MAXNAMELEN);
+
+ if (scn->scn_pausing)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ goto out;
+ }
+
+ /*
+ * Add descendant datasets to the work queue.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ }
+ if (ds->ds_phys->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (ds->ds_phys->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == ds->ds_phys->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ VERIFY(zap_join_key(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ } else {
+ struct enqueue_clones_arg eca;
+ eca.tx = tx;
+ eca.originobj = ds->ds_object;
+
+ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
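+
+/*
+ * Editorial note (illustration only): the scan queue used above is a
+ * ZAP object serving as a map from dataset object number to the txg
+ * at which that dataset's pass should start, conceptually
+ *
+ *	queue[dsobj] = mintxg;
+ *
+ * The old dsl_scrub.c below used zap_add_int(), storing only the
+ * dsobj key; carrying mintxg as the value is what lets
+ * dsl_scan_visit() set scn_cur_min_txg per dataset when it dequeues
+ * an entry.
+ */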
+
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_pause(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+ (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_pausing);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
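+
+/*
+ * Editorial sketch (hypothetical helpers ddt_walk_class() and
+ * traverse_pool(); not part of this change) of the two-phase order
+ * described above:
+ *
+ *	Phase 1: walk the DDT, scrubbing each block with refcnt > 1
+ *	exactly once, highest class first:
+ *
+ *		for (c = DDT_CLASS_DITTO; c <= class_max; c++)
+ *			ddt_walk_class(c, scrub_block_once);
+ *
+ *	Phase 2: top-down pool traversal; blocks in a class covered by
+ *	phase 1 are skipped via the ddt_class_contains() check in
+ *	dsl_scan_visitbp():
+ *
+ *		traverse_pool(scrub_if_not_in_ddt);
+ */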
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_t zb = { 0 };
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_pausing)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_pausing);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ /*
+ * If we were paused, continue from here. Note if the
+ * ds we were paused on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ /*
+ * In case we were paused right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+ /* keep pulling things out of the zap-object-as-queue */
+ while (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj),
+ zap_cursor_retrieve(&zc, &za) == 0) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ dsobj = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+ /* Set up min/max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (za.za_first_integer != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ za.za_first_integer);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ ds->ds_phys->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ zap_cursor_fini(&zc);
+ if (scn->scn_pausing)
+ return;
+ }
+ zap_cursor_fini(&zc);
+}
+
+static int
+dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+ uint64_t elapsed_nanosecs;
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa))
+ return (ERESTART);
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
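+
+/*
+ * Editorial note: the early return above is a time budget, not a
+ * count.  elapsed_nanosecs / NANOSEC is whole seconds (checked
+ * against zfs_txg_timeout) and elapsed_nanosecs / MICROSEC is
+ * milliseconds (checked against zfs_free_min_time_ms), so the frees
+ * yield with ERESTART once the sync has run long enough;
+ * dsl_scan_sync() then returns and the remaining free_bpobj entries
+ * are processed in a later txg.
+ */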
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ return (used != 0);
+}
+
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
+ */
+ if (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, tx->tx_txg);
+ dsl_scan_setup_sync(scn, &func, tx);
+ }
+
+ if (!dsl_scan_active(scn) ||
+ spa_sync_pass(dp->dp_spa) > 1)
+ return;
+
+ scn->scn_visited_this_txg = 0;
+ scn->scn_pausing = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the free list. If we pause the free, don't do
+ * any scanning. This ensures that there is no free list when
+ * we are scanning, so the scan code doesn't have to worry about
+ * traversing it.
+ */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ dsl_scan_free_cb, scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj txg %llu",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ (longlong_t)tx->tx_txg);
+ scn->scn_visited_this_txg = 0;
+ /*
+ * Re-sync the ddt so that we can further modify
+ * it when doing bprewrite.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err == ERESTART)
+ return;
+ }
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ dsl_scan_visit(scn, tx);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("visited %llu blocks in %llums",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+ if (!scn->scn_pausing) {
+ /* finished with scan. */
+ zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+ dsl_scan_done(scn, B_TRUE, tx);
+ }
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+}
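+
+/*
+ * Editorial worked example for the 3-DVA case above: `equal' counts
+ * how many of the three pairwise vdev comparisons match.
+ *
+ *	DVAs on vdevs (1, 2, 3)  ->  equal == 0  (all distinct)
+ *	DVAs on vdevs (1, 1, 3)  ->  equal == 1  (2 of 3 share a vdev)
+ *	DVAs on vdevs (1, 1, 1)  ->  equal == 3  (all 3 share a vdev)
+ *
+ * equal == 2 is impossible (vdev equality is transitive), which is
+ * why only the == 1 and == 3 cases are counted.
+ */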
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_t *zb)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ boolean_t needs_io;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+ int zio_priority;
+ int scan_delay = 0;
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg)
+ return (0);
+
+ count_block(dp->dp_blkstats, bp);
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ zio_priority = ZIO_PRIORITY_SCRUB;
+ needs_io = B_TRUE;
+ scan_delay = zfs_scrub_delay;
+ } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ zio_flags |= ZIO_FLAG_RESILVER;
+ zio_priority = ZIO_PRIORITY_RESILVER;
+ needs_io = B_FALSE;
+ scan_delay = zfs_resilver_delay;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vdev_t *vd = vdev_lookup_top(spa,
+ DVA_GET_VDEV(&bp->blk_dva[d]));
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io) {
+ if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ needs_io = B_TRUE;
+ } else {
+ needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+ phys_birth, 1);
+ }
+ }
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If we're seeing recent (zfs_scan_idle) "important" I/Os
+ * then throttle our workload to limit the impact of a scan.
+ */
+ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+ delay(scan_delay);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ dsl_scan_scrub_done, NULL, zio_priority,
+ zio_flags, zb));
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
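+
+/*
+ * Editorial example (numbers assumed for illustration): the inflight
+ * cap above scales with pool width -- with 4 top-level vdevs and
+ * zfs_top_maxinflight = 32, at most 4 * 32 = 128 scrub reads are in
+ * flight pool-wide.  Independently, if any non-scan I/O was seen in
+ * the last zfs_scan_idle ticks, each scrub read is preceded by a
+ * delay() of scan_delay ticks to keep the scan from competing with
+ * foreground work.
+ */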
+
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+ dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
deleted file mode 100644
index 50cc069a3a78..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
+++ /dev/null
@@ -1,1060 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
-int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
- NULL,
- dsl_pool_scrub_clean_cb
-};
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- enum scrub_func *funcp = arg2;
- dmu_object_type_t ot = 0;
- boolean_t complete = B_FALSE;
-
- dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
- ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
- ASSERT(*funcp > SCRUB_FUNC_NONE);
- ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
- dp->dp_scrub_min_txg = 0;
- dp->dp_scrub_max_txg = tx->tx_txg;
-
- if (*funcp == SCRUB_FUNC_CLEAN) {
- vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
- /* rewrite all disk labels */
- vdev_config_dirty(rvd);
-
- if (vdev_resilver_needed(rvd,
- &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_RESILVER_START);
- dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
- tx->tx_txg);
- } else {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_SCRUB_START);
- }
-
- /* zero out the scrub stats in all vdev_stat_t's */
- vdev_scrub_stat_update(rvd,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
-
- dp->dp_spa->spa_scrub_started = B_TRUE;
- }
-
- /* back to the generic stuff */
-
- if (dp->dp_blkstats == NULL) {
- dp->dp_blkstats =
- kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
- }
- bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
- ot = DMU_OT_ZAP_OTHER;
-
- dp->dp_scrub_func = *funcp;
- dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
- ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
- dp->dp_scrub_restart = B_FALSE;
- dp->dp_spa->spa_scrub_errors = 0;
-
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &dp->dp_spa->spa_scrub_errors, tx));
-
- spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
- "func=%u mintxg=%llu maxtxg=%llu",
- *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- boolean_t *completep = arg2;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- mutex_enter(&dp->dp_scrub_cancel_lock);
-
- if (dp->dp_scrub_restart) {
- dp->dp_scrub_restart = B_FALSE;
- *completep = B_FALSE;
- }
-
- /* XXX this is scrub-clean specific */
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- while (dp->dp_spa->spa_scrub_inflight > 0) {
- cv_wait(&dp->dp_spa->spa_scrub_io_cv,
- &dp->dp_spa->spa_scrub_lock);
- }
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_started = B_FALSE;
- dp->dp_spa->spa_scrub_active = B_FALSE;
-
- dp->dp_scrub_func = SCRUB_FUNC_NONE;
- VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, tx));
- dp->dp_scrub_queue_obj = 0;
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, tx));
-
- spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
- "complete=%u", *completep);
-
- /* below is scrub-clean specific */
- vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
- *completep);
- /*
- * If the scrub/resilver completed, update all DTLs to reflect this.
- * Whether it succeeded or not, vacate all temporary scrub DTLs.
- */
- vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
- *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
- if (*completep)
- spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
- spa_errlog_rotate(dp->dp_spa);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
- dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
- boolean_t complete = B_FALSE;
-
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-int
-dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- /*
- * This function will be used by bp-rewrite wad to intercept frees.
- */
- return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
- done, private, arc_flags));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
- return (zb->zb_objset == 0 && zb->zb_object == 0 &&
- zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
-
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb1->zb_object != -1ULL);
- ASSERT(zb2->zb_level == 0);
-
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == -1ULL)
- return (B_TRUE);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
- if (zb1->zb_object == 0) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
- }
-
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == 0)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
-{
- int elapsed_ticks;
- int mintime;
-
- if (dp->dp_scrub_pausing)
- return (B_TRUE); /* we're already pausing */
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
- return (B_FALSE); /* we're resuming */
-
- /* We only know how to resume from level-0 blocks. */
- if (zb->zb_level != 0)
- return (B_FALSE);
-
- mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
- zfs_scrub_min_time;
- elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
- if (elapsed_ticks > hz * zfs_txg_timeout ||
- (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
- dprintf("pausing at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
- dp->dp_scrub_pausing = B_TRUE;
- dp->dp_scrub_bookmark = *zb;
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
- dsl_pool_t *zta_dp;
- zil_header_t *zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static void
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- /*
- * One block ("stubby") can be allocated a long time ago; we
- * want to visit that one because it has been allocated
- * (on-disk) even if it hasn't been claimed (even though for
- * plain scrub there's nothing to do to it).
- */
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
- return;
-
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- /*
- * birth can be < claim_txg if this record's txg is
- * already txg sync'ed (but this log block contains
- * other records that are not synced)
- */
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
-
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
- }
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
- uint64_t claim_txg = zh->zh_claim_txg;
- zil_traverse_arg_t zta = { dp, zh };
- zilog_t *zilog;
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
- */
- if (claim_txg == 0 && spa_writeable(dp->dp_spa))
- return;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
- claim_txg);
-
- zil_free(zilog);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
- arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
- int err;
- arc_buf_t *buf = NULL;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- if (scrub_pause(dp, zb))
- return;
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
- /*
- * If we already visited this bp & everything below (in
- * a prior txg), don't bother doing it again.
- */
- if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
- return;
-
- /*
- * If we found the block we're trying to resume from, or
- * we went past it to a different object, zero it out to
- * indicate that it's OK to start checking for pausing
- * again.
- */
- if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
- zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
- dprintf("resuming at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
- }
- }
-
- if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- cbp = buf->b_data;
-
- for (i = 0; i < epb; i++, cbp++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- scrub_visitbp(dp, dnp, buf, cbp, &czb);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
- dnode_phys_t *child_dnp;
- int i;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- child_dnp = buf->b_data;
-
- for (i = 0; i < epb; i++, child_dnp++) {
- scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
- zb->zb_blkid * epb + i);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
- objset_phys_t *osp;
-
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
-
- osp = buf->b_data;
-
- traverse_zil(dp, &osp->os_zil_header);
-
- scrub_visitdnode(dp, &osp->os_meta_dnode,
- buf, zb->zb_objset, 0);
- if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- scrub_visitdnode(dp, &osp->os_userused_dnode,
- buf, zb->zb_objset, 0);
- scrub_visitdnode(dp, &osp->os_groupused_dnode,
- buf, zb->zb_objset, 0);
- }
- }
-
- (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
- if (buf)
- (void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object)
-{
- int j;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
- }
-
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
- zbookmark_t zb;
-
- SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
- scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) != 0) {
- return;
- }
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- dp->dp_scrub_bookmark.zb_objset =
- ds->ds_phys->ds_prev_snap_obj;
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_prev_snap_obj, tx) == 0);
- }
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
- } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
- }
-
- if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds1->ds_object, tx) == 0) {
- int err = zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds2->ds_object, tx);
- VERIFY(err == 0 || err == EEXIST);
- if (err == EEXIST) {
- /* Both were there to begin with */
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds2->ds_object, tx) == 0) {
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
-}
-
-struct enqueue_clones_arg {
- dmu_tx_t *tx;
- uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- struct enqueue_clones_arg *eca = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
- dp = ds->ds_dir->dd_pool;
-
- if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
- dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
- ds = prev;
- }
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, eca->tx) == 0);
- }
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
- uint64_t min_txg_save;
-
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
- /*
- * Iterate over the bps in this ds.
- */
- min_txg_save = dp->dp_scrub_min_txg;
- dp->dp_scrub_min_txg =
- MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
- scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
- dp->dp_scrub_min_txg = min_txg_save;
-
- if (dp->dp_scrub_pausing)
- goto out;
-
- /*
- * Add descendent datasets to work queue.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- if (ds->ds_phys->ds_num_children > 1) {
- boolean_t usenext = B_FALSE;
- if (ds->ds_phys->ds_next_clones_obj != 0) {
- uint64_t count;
- /*
- * A bug in a previous version of the code could
- * cause upgrade_clones_cb() to not set
- * ds_next_snap_obj when it should, leading to a
- * missing entry. Therefore we can only use the
- * next_clones_obj when its count is correct.
- */
- int err = zap_count(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj, &count);
- if (err == 0 &&
- count == ds->ds_phys->ds_num_children - 1)
- usenext = B_TRUE;
- }
-
- if (usenext) {
- VERIFY(zap_join(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj,
- dp->dp_scrub_queue_obj, tx) == 0);
- } else {
- struct enqueue_clones_arg eca;
- eca.tx = tx;
- eca.originobj = ds->ds_object;
-
- (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
- NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
- }
- }
-
-out:
- dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- dmu_tx_t *tx = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
-
- dp = ds->ds_dir->dd_pool;
-
- while (ds->ds_phys->ds_prev_snap_obj != 0) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- FTAG, &prev);
- if (err) {
- dsl_dataset_rele(ds, FTAG);
- return (err);
- }
-
- /*
- * If this is a clone, we don't need to worry about it for now.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_rele(ds, FTAG);
- dsl_dataset_rele(prev, FTAG);
- return (0);
- }
- dsl_dataset_rele(ds, FTAG);
- ds = prev;
- }
-
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0);
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- spa_t *spa = dp->dp_spa;
- zap_cursor_t zc;
- zap_attribute_t za;
- boolean_t complete = B_TRUE;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- /*
- * If the pool is not loaded, or is trying to unload, leave it alone.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
- return;
-
- if (dp->dp_scrub_restart) {
- enum scrub_func func = dp->dp_scrub_func;
- dp->dp_scrub_restart = B_FALSE;
- dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
- }
-
- if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
- /*
- * We must have resumed after rebooting; reset the vdev
- * stats to know that we're doing a scrub (although it
- * will think we're just starting now).
- */
- vdev_scrub_stat_update(spa->spa_root_vdev,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
- }
-
- dp->dp_scrub_pausing = B_FALSE;
- dp->dp_scrub_start_time = lbolt64;
- dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
- spa->spa_scrub_active = B_TRUE;
-
- if (dp->dp_scrub_bookmark.zb_objset == 0) {
- /* First do the MOS & ORIGIN */
- scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
- if (dp->dp_scrub_pausing)
- goto out;
-
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- VERIFY(0 == dmu_objset_find_spa(spa,
- NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
- } else {
- scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
- }
- ASSERT(!dp->dp_scrub_pausing);
- } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
- /*
- * If we were paused, continue from here. Note if the
- * ds we were paused on was deleted, the zb_objset will
- * be -1, so we will skip this and find a new objset
- * below.
- */
- scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
- if (dp->dp_scrub_pausing)
- goto out;
- }
-
- /*
- * In case we were paused right at the end of the ds, zero the
- * bookmark so we don't think that we're still trying to resume.
- */
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
- /* keep pulling things out of the zap-object-as-queue */
- while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
- zap_cursor_retrieve(&zc, &za) == 0) {
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, za.za_name, tx));
- scrub_visitds(dp, za.za_first_integer, tx);
- if (dp->dp_scrub_pausing)
- break;
- zap_cursor_fini(&zc);
- }
- zap_cursor_fini(&zc);
- if (dp->dp_scrub_pausing)
- goto out;
-
- /* done. */
-
- dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
- return;
-out:
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors, tx));
-
- /* XXX this is scrub-clean specific */
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > 0)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
- mutex_enter(&dp->dp_scrub_cancel_lock);
- dp->dp_scrub_restart = B_TRUE;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
- int i;
-
- /*
- * If we resume after a reboot, zab will be NULL; don't record
- * incomplete stats in that case.
- */
- if (zab == NULL)
- return;
-
- for (i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
- int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
- zfs_blkstat_t *zb = &zab->zab_type[l][t];
- int equal;
-
- zb->zb_count++;
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_gangs += BP_COUNT_GANG(bp);
-
- switch (BP_GET_NDVAS(bp)) {
- case 2:
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]))
- zb->zb_ditto_2_of_2_samevdev++;
- break;
- case 3:
- equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) +
- (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2])) +
- (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal == 1)
- zb->zb_ditto_2_of_3_samevdev++;
- else if (equal == 3)
- zb->zb_ditto_3_of_3_samevdev++;
- break;
- }
- }
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- zio_data_buf_free(zio->io_data, zio->io_size);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- if (zio->io_error && (zio->io_error != ECKSUM ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
- spa->spa_scrub_errors++;
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
- const blkptr_t *bp, const zbookmark_t *zb)
-{
- size_t size = BP_GET_PSIZE(bp);
- spa_t *spa = dp->dp_spa;
- boolean_t needs_io;
- int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- int zio_priority;
-
- ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
-
- if (bp->blk_birth >= dp->dp_scrub_max_txg)
- return (0);
-
- count_block(dp->dp_blkstats, bp);
-
- if (dp->dp_scrub_isresilver == 0) {
- /* It's a scrub */
- zio_flags |= ZIO_FLAG_SCRUB;
- zio_priority = ZIO_PRIORITY_SCRUB;
- needs_io = B_TRUE;
- } else {
- /* It's a resilver */
- zio_flags |= ZIO_FLAG_RESILVER;
- zio_priority = ZIO_PRIORITY_RESILVER;
- needs_io = B_FALSE;
- }
-
- /* If it's an intent log block, failure is expected. */
- if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
- zio_flags |= ZIO_FLAG_SPECULATIVE;
-
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
- vdev_t *vd = vdev_lookup_top(spa,
- DVA_GET_VDEV(&bp->blk_dva[d]));
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined +=
- DVA_GET_ASIZE(&bp->blk_dva[d]);
- mutex_exit(&vd->vdev_stat_lock);
-
- /* if it's a resilver, this may not be in the target range */
- if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best estimate we have is the
- * scrub range, which has already been checked.
- * XXX -- it would be better to change our
- * allocation policy to ensure that all
- * gang members reside on the same vdev.
- */
- needs_io = B_TRUE;
- } else {
- needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
- bp->blk_birth, 1);
- }
- }
- }
-
- if (needs_io && !zfs_no_scrub_io) {
- void *data = zio_data_buf_alloc(size);
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
- mutex_exit(&spa->spa_scrub_lock);
-
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_pool_scrub_clean_done, NULL, zio_priority,
- zio_flags, zb));
- }
-
- /* do not relocate this block */
- return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
- spa_t *spa = dp->dp_spa;
-
- /*
- * Purge all vdev caches. We do this here rather than in sync
- * context because this requires a writer lock on the spa_config
- * lock, which we can't do from sync context. The
- * spa_scrub_reopen flag indicates that vdev_open() should not
- * attempt to start another scrub.
- */
- spa_vdev_state_enter(spa);
- spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
index 21100225abf7..b0818ce274d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -19,18 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
-#include <sys/cred.h>
+#include <sys/metaslab.h>
#define DST_AVG_BLKSHIFT 14
@@ -50,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
offsetof(dsl_sync_task_t, dst_node));
dstg->dstg_pool = dp;
- dstg->dstg_cr = CRED();
return (dstg);
}
@@ -112,14 +108,21 @@ top:
return (dstg->dstg_err);
}
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+ /*
+ * We don't generally have many sync tasks, so pay the price of
+ * add_tail to get the tasks executed in the right order.
+ */
+ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+ dstg, txg));
dmu_tx_commit(tx);
txg_wait_synced(dstg->dstg_pool, txg);
- if (dstg->dstg_err == EAGAIN)
+ if (dstg->dstg_err == EAGAIN) {
+ txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
goto top;
+ }
return (dstg->dstg_err);
}
@@ -131,7 +134,12 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
dstg->dstg_nowaiter = B_TRUE;
txg = dmu_tx_get_txg(tx);
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+ /*
+ * We don't generally have many sync tasks, so pay the price of
+ * add_tail to get the tasks executed in the right order.
+ */
+ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+ dstg, txg));
}
void
@@ -150,25 +158,30 @@ void
dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
{
dsl_sync_task_t *dst;
- void *tr_cookie;
+ dsl_pool_t *dp = dstg->dstg_pool;
+ uint64_t quota, used;
ASSERT3U(dstg->dstg_err, ==, 0);
/*
- * Check for sufficient space.
+ * Check for sufficient space. We just check against what's
+ * on-disk; we don't want any in-flight accounting to get in our
+ * way, because open context may have already used up various
+ * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
*/
- dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
- dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
- /* don't bother trying again */
- if (dstg->dstg_err == ERESTART)
- dstg->dstg_err = EAGAIN;
- if (dstg->dstg_err)
+ quota = dsl_pool_adjustedsize(dp, B_FALSE) -
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ used = dp->dp_root_dir->dd_phys->dd_used_bytes;
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) {
+ dstg->dstg_err = ENOSPC;
return;
+ }
/*
* Check for errors by calling checkfuncs.
*/
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
dst->dst_err =
@@ -183,13 +196,10 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
*/
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
- dstg->dstg_cr, tx);
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
}
}
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- dsl_dir_tempreserve_clear(tr_cookie, tx);
+ rw_exit(&dp->dp_config_rwlock);
if (dstg->dstg_nowaiter)
dsl_sync_task_group_destroy(dstg);
@@ -203,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp,
dsl_sync_task_group_t *dstg;
int err;
+ ASSERT(spa_writeable(dp->dp_spa));
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
@@ -218,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp,
{
dsl_sync_task_group_t *dstg;
+ if (!spa_writeable(dp->dp_spa))
+ return;
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
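
The reworked space check in dsl_sync_task_group_sync() above compares on-disk usage against an adjusted quota and triples the requested space because MOS data is triple-dittoed. A stand-alone restatement of that arithmetic, with names local to this sketch:

    #include <stdint.h>

    /*
     * quota: adjusted pool size minus deferred-free space
     * used:  on-disk usage of the root dir (dd_used_bytes)
     * space: the task group's dstg_space estimate, tripled for ditto copies
     */
    static int
    sync_task_space_ok(uint64_t quota, uint64_t used, uint64_t space)
    {
    	if (space > 0 && used + space * 3 > quota)
    		return (0);	/* the real code sets dstg_err = ENOSPC */
    	return (1);
    }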
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
index a88b85c7ec25..a2d9dab88440 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -20,21 +20,20 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
- * We keep our own copy of this algorithm for 2 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
* directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
+ * 2. Our version of lzjb does not have a number of checks that the
* common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
* In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
*/
#include <sys/zfs_context.h>
@@ -44,7 +43,7 @@
#define MATCH_MIN 3
#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 256
+#define LEMPEL_SIZE 1024
/*ARGSUSED*/
size_t
@@ -54,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
uchar_t *dst = d_start;
uchar_t *cpy, *copymap;
int copymask = 1 << (NBBY - 1);
- int mlen, offset;
+ int mlen, offset, hash;
uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+ uint16_t lempel[LEMPEL_SIZE] = { 0 };
while (src < (uchar_t *)s_start + s_len) {
if ((copymask <<= 1) == (1 << NBBY)) {
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
- if (d_len != s_len)
- return (s_len);
- mlen = s_len;
- for (src = s_start, dst = d_start; mlen; mlen--)
- *dst++ = *src++;
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
return (s_len);
- }
copymask = 1;
copymap = dst;
*dst++ = 0;
@@ -76,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
*dst++ = *src++;
continue;
}
- hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
- (LEMPEL_SIZE - 1)];
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
offset = (intptr_t)(src - *hp) & OFFSET_MASK;
*hp = (uint16_t)(uintptr_t)src;
cpy = src - offset;
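
The new hash above mixes all three bytes of the match window before indexing the enlarged, zero-initialized Lempel table; the zero fill is what makes compression deterministic, so identical blocks always compress to identical output and can dedup. The hash step in isolation:

    #include <stdint.h>

    #define	LEMPEL_SIZE	1024

    /* Hash a 3-byte match window into a Lempel-table index, as above. */
    static unsigned
    lzjb_hash3(const unsigned char *src)
    {
    	int hash = (src[0] << 16) + (src[1] << 8) + src[2];

    	hash += hash >> 9;
    	hash += hash >> 5;
    	return (hash & (LEMPEL_SIZE - 1));
    }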
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index c5ce27cb677c..17b4b12c4ee4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -23,7 +23,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
@@ -35,6 +34,11 @@ uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ */
+static int metaslab_debug = 0;
+
+/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
@@ -72,12 +76,13 @@ int metaslab_smo_bonus_pct = 150;
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
@@ -87,58 +92,73 @@ metaslab_class_create(space_map_ops_t *ops)
void
metaslab_class_destroy(metaslab_class_t *mc)
{
- metaslab_group_t *mg;
-
- while ((mg = mc->mc_rotor) != NULL) {
- metaslab_class_remove(mc, mg);
- metaslab_group_destroy(mg);
- }
+ ASSERT(mc->mc_rotor == NULL);
+ ASSERT(mc->mc_alloc == 0);
+ ASSERT(mc->mc_deferred == 0);
+ ASSERT(mc->mc_space == 0);
+ ASSERT(mc->mc_dspace == 0);
kmem_free(mc, sizeof (metaslab_class_t));
}
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+int
+metaslab_class_validate(metaslab_class_t *mc)
{
- metaslab_group_t *mgprev, *mgnext;
+ metaslab_group_t *mg;
+ vdev_t *vd;
- ASSERT(mg->mg_class == NULL);
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
- if ((mgprev = mc->mc_rotor) == NULL) {
- mg->mg_prev = mg;
- mg->mg_next = mg;
- } else {
- mgnext = mgprev->mg_next;
- mg->mg_prev = mgprev;
- mg->mg_next = mgnext;
- mgprev->mg_next = mg;
- mgnext->mg_prev = mg;
- }
- mc->mc_rotor = mg;
- mg->mg_class = mc;
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
}
void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
- metaslab_group_t *mgprev, *mgnext;
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
- ASSERT(mg->mg_class == mc);
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
- mgprev = mg->mg_prev;
- mgnext = mg->mg_next;
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
- if (mg == mgnext) {
- mc->mc_rotor = NULL;
- } else {
- mc->mc_rotor = mgnext;
- mgprev->mg_next = mgnext;
- mgnext->mg_prev = mgprev;
- }
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
- mg->mg_prev = NULL;
- mg->mg_next = NULL;
- mg->mg_class = NULL;
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
/*
@@ -179,9 +199,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
mg->mg_vd = vd;
- metaslab_class_add(mc, mg);
+ mg->mg_class = mc;
+ mg->mg_activation_count = 0;
return (mg);
}
@@ -189,11 +209,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
void
metaslab_group_destroy(metaslab_group_t *mg)
{
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ /*
+ * We may have gone below zero with the activation count
+ * either because we never activated in the first place or
+ * because we're done, and possibly removing the vdev.
+ */
+ ASSERT(mg->mg_activation_count <= 0);
+
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
kmem_free(mg, sizeof (metaslab_group_t));
}
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count <= 0);
+
+ if (++mg->mg_activation_count <= 0)
+ return;
+
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+}
+
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+ if (--mg->mg_activation_count != 0) {
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count < 0);
+ return;
+ }
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+}
+
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
@@ -611,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
metaslab_group_add(mg, msp);
+ if (metaslab_debug && smo->smo_object != 0) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+ SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
+ mutex_exit(&msp->ms_lock);
+ }
+
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
@@ -621,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
metaslab_sync_done(msp, 0);
if (txg != 0) {
- /*
- * The vdev is dirty, but the metaslab isn't -- it just needs
- * to have metaslab_sync_done() invoked from vdev_sync_done().
- * [We could just dirty the metaslab, but that would cause us
- * to allocate a space map object for it, which is wasteful
- * and would mess up the locality logic in metaslab_weight().]
- */
- ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
}
return (msp);
@@ -640,10 +730,9 @@ void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
- int t;
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc, B_TRUE);
+ vdev_space_update(mg->mg_vd,
+ -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
metaslab_group_remove(mg, msp);
@@ -652,11 +741,16 @@ metaslab_fini(metaslab_t *msp)
space_map_unload(&msp->ms_map);
space_map_destroy(&msp->ms_map);
- for (t = 0; t < TXG_SIZE; t++) {
+ for (int t = 0; t < TXG_SIZE; t++) {
space_map_destroy(&msp->ms_allocmap[t]);
space_map_destroy(&msp->ms_freemap[t]);
}
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_destroy(&msp->ms_defermap[t]);
+
+ ASSERT3S(msp->ms_deferspace, ==, 0);
+
mutex_exit(&msp->ms_lock);
mutex_destroy(&msp->ms_lock);
@@ -741,7 +835,7 @@ metaslab_prefetch(metaslab_group_t *mg)
if (!sm->sm_loaded && smo->smo_object != 0) {
mutex_exit(&mg->mg_lock);
- dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+ dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
0ULL, smo->smo_objsize);
mutex_enter(&mg->mg_lock);
}
@@ -759,11 +853,19 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- if (error) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
+ space_map_load_wait(sm);
+ if (!sm->sm_loaded) {
+ int error = space_map_load(sm, sm_ops, SM_FREE,
+ &msp->ms_smo,
+ spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+ if (error) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_claim, sm);
+
}
/*
@@ -812,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
{
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
+ objset_t *mos = spa_meta_objset(spa);
space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
@@ -820,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_obj_t *smo = &msp->ms_smo_syncing;
dmu_buf_t *db;
dmu_tx_t *tx;
- int t;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ ASSERT(!vd->vdev_ishole);
+
+ if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+ return;
/*
* The only state that can actually be changing concurrently with
@@ -832,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* We drop it whenever we call into the DMU, because the DMU
* can call down to us (e.g. via zio_free()) at any time.
*/
- mutex_enter(&msp->ms_lock);
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (smo->smo_object == 0) {
ASSERT(smo->smo_objsize == 0);
ASSERT(smo->smo_alloc == 0);
- mutex_exit(&msp->ms_lock);
smo->smo_object = dmu_object_alloc(mos,
DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
@@ -845,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
(sm->sm_start >> vd->vdev_ms_shift),
sizeof (uint64_t), &smo->smo_object, tx);
- mutex_enter(&msp->ms_lock);
}
+ mutex_enter(&msp->ms_lock);
+
space_map_walk(freemap, space_map_add, freed_map);
if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
@@ -860,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* This metaslab is 100% allocated,
* minus the content of the in-core map (sm),
* minus what's been freed this txg (freed_map),
+ * minus deferred frees (ms_defermap[]),
* minus allocations from txgs in the future
* (because they haven't been committed yet).
*/
@@ -871,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_walk(sm, space_map_remove, allocmap);
space_map_walk(freed_map, space_map_remove, allocmap);
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_remove, allocmap);
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
space_map_remove, allocmap);
@@ -905,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
space_map_obj_t *smosync = &msp->ms_smo_syncing;
space_map_t *sm = &msp->ms_map;
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
- int t;
+ int64_t alloc_delta, defer_delta;
+
+ ASSERT(!vd->vdev_ishole);
mutex_enter(&msp->ms_lock);
@@ -916,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* allocmaps and freemaps and add its capacity to the vdev.
*/
if (freed_map->sm_size == 0) {
- for (t = 0; t < TXG_SIZE; t++) {
+ for (int t = 0; t < TXG_SIZE; t++) {
space_map_create(&msp->ms_allocmap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_create(&msp->ms_defermap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+
+ vdev_space_update(vd, 0, 0, sm->sm_size);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+ alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+ defer_delta = freed_map->sm_space - defer_map->sm_space;
+
+ vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -933,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* If there's a space_map_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
- * Then, add everything we freed in this txg to the map.
+ * Then, add defer_map (oldest deferred frees) to this map and
+ * transfer freed_map (this txg's frees) to defer_map.
*/
space_map_load_wait(sm);
- space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+ space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+ space_map_vacate(freed_map, space_map_add, defer_map);
*smo = *smosync;
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+
/*
* If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then
@@ -948,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int evictable = 1;
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
evictable = 0;
- if (evictable)
+ if (evictable && !metaslab_debug)
space_map_unload(sm);
}
@@ -1119,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
- if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0)
+ if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_allocated because
+ * Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
@@ -1146,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (flags & METASLAB_HINTBP_AVOID)
- mg = vd->vdev_mg->mg_next;
- else
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists (i.e. removed). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL) {
mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
@@ -1158,15 +1303,18 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
}
/*
- * If the hint put us into the wrong class, just follow the rotor.
+ * If the hint put us into the wrong metaslab class, or into a
+ * metaslab group that has been passivated, just follow the rotor.
*/
- if (mg->mg_class != mc)
+ if (mg->mg_class != mc || mg->mg_activation_count <= 0)
mg = mc->mc_rotor;
rotor = mg;
top:
all_zero = B_TRUE;
do {
+ ASSERT(mg->mg_activation_count == 1);
+
vd = mg->mg_vd;
/*
@@ -1211,32 +1359,28 @@ top:
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
- if (mc->mc_allocated == 0) {
+ if (mc->mc_aliquot == 0) {
vdev_stat_t *vs = &vd->vdev_stat;
- uint64_t alloc, space;
- int64_t vu, su;
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
+ int64_t vu, cu;
/*
* Determine percent used in units of 0..1024.
* (This is just to avoid floating point.)
*/
vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- su = (alloc << 10) / (space + 1);
+ cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
/*
* Bias by at most +/- 25% of the aliquot.
*/
- mg->mg_bias = ((su - vu) *
+ mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / (1024 * 4);
}
- if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -1248,7 +1392,7 @@ top:
}
next:
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
if (!all_zero) {
@@ -1328,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
uint64_t size = DVA_GET_ASIZE(dva);
vdev_t *vd;
metaslab_t *msp;
- int error;
+ int error = 0;
ASSERT(DVA_IS_VALID(dva));
@@ -1343,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+
+ if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+ error = ENOENT;
+
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
@@ -1371,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int error = 0;
ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -1400,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
spa_config_exit(spa, SCL_ALLOC, FTAG);
- bp->blk_birth = txg;
+ BP_SET_BIRTH(bp, txg, txg);
return (0);
}
@@ -1412,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
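
When a class aliquot cycle starts (mc_aliquot == 0), the code above biases each group's share by the difference between class-wide and per-vdev utilization, measured in 1/1024 units, which by construction clamps the bias to +/- 25% of the aliquot. The arithmetic on its own, with names local to this sketch:

    #include <stdint.h>

    static int64_t
    metaslab_bias(uint64_t vs_alloc, uint64_t vs_space,	/* one vdev */
        uint64_t mc_alloc, uint64_t mc_space,		/* whole class */
        uint64_t aliquot)
    {
    	int64_t vu = (int64_t)((vs_alloc << 10) / (vs_space + 1));
    	int64_t cu = (int64_t)((mc_alloc << 10) / (mc_space + 1));

    	/* (cu - vu) lies in -1024..1024, so the bias is within aliquot/4. */
    	return (((cu - vu) * (int64_t)aliquot) / (1024 * 4));
    }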
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
index 5fe4e638055a..6d8e2f221425 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -19,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/refcount.h>
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -192,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder)
return (refcount_remove_many(rc, 1, holder));
}
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+#endif /* ZFS_DEBUG */
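
Note that refcount_transfer() never holds both rc_mtx locks at once: it snapshots and empties the source under its own lock, drops it, and only then merges into the destination. That sidesteps any lock-ordering question between arbitrary counters. The same two-phase move in a generic, self-contained form:

    #include <pthread.h>
    #include <stdint.h>

    typedef struct counter {
    	pthread_mutex_t	mtx;
    	int64_t		count;
    } counter_t;

    static void
    counter_transfer(counter_t *dst, counter_t *src)
    {
    	int64_t moved;

    	/* Phase 1: drain the source under its own lock only. */
    	pthread_mutex_lock(&src->mtx);
    	moved = src->count;
    	src->count = 0;
    	pthread_mutex_unlock(&src->mtx);

    	/* Phase 2: merge into the destination under its lock only. */
    	pthread_mutex_lock(&dst->mtx);
    	dst->count += moved;
    	pthread_mutex_unlock(&dst->mtx);
    }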
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
new file mode 100644
index 000000000000..4db13fd917c3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -0,0 +1,1970 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * A mapping between attribute "string" names and their actual attribute
+ * numeric values, length, and byteswap function is stored persistently
+ * on a per-dataset basis. The names are only used
+ * during registration. All attributes are known by their unique attribute
+ * id value. If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually. Since you will typically have the same set of attributes
+ * stored in the same order a single table will be used to represent that
+ * layout. The ZPL, for example, will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dirs with extended attributes, plus
+ * each of those variants minus the ACL, because the ACL may be
+ * kicked out into the spill block).
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what's
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on-disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes. The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. There should be a small number of
+ * known layouts and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ * Since the SA attributes are not entirely self-describing, we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping, whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
+
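
To make the layout idea above concrete, here is a toy model (illustrative only; none of the real sa_lot_t, AVL, or ZAP machinery): a layout is just an ordered list of attribute ids, identical orderings share one layout number, and that number is what the SA header records.

    #include <stdint.h>
    #include <string.h>

    typedef uint16_t attr_id_t;

    typedef struct toy_layout {
    	int		count;
    	attr_id_t	attrs[16];
    } toy_layout_t;

    static toy_layout_t	layouts[64];
    static int		nlayouts;

    /* Return the layout number for this ordering, creating one if it's new. */
    static int
    layout_number(const attr_id_t *attrs, int count)
    {
    	for (int i = 0; i < nlayouts; i++) {
    		if (layouts[i].count == count &&
    		    memcmp(layouts[i].attrs, attrs,
    		    count * sizeof (attr_id_t)) == 0)
    			return (i);
    	}
    	layouts[nlayouts].count = count;
    	memcpy(layouts[nlayouts].attrs, attrs, count * sizeof (attr_id_t));
    	return (nlayouts++);
    }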
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ void *data);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
+#define SA_COPY_DATA(f, s, t, l) \
+ { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else \
+ sa_copy_data(f, s, t, l); \
+ }
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * ZPL legacy layout
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ hdl->sa_bonus_tab = NULL;
+ hdl->sa_spill_tab = NULL;
+ hdl->sa_os = NULL;
+ hdl->sa_userp = NULL;
+ hdl->sa_bonus = NULL;
+ hdl->sa_spill = NULL;
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_num > node2->lot_num)
+ return (1);
+ else if (node1->lot_num < node2->lot_num)
+ return (-1);
+ return (0);
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_hash > node2->lot_hash)
+ return (1);
+ if (node1->lot_hash < node2->lot_hash)
+ return (-1);
+ if (node1->lot_instance > node2->lot_instance)
+ return (1);
+ if (node1->lot_instance < node2->lot_instance)
+ return (-1);
+ return (0);
+}
+
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
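
Since sa_layout_info_hash() XORs one table entry per attribute, the hash is order-independent: two layouts with the same attributes in a different order collide. That is harmless because sa_find_layout() still compares the full ordering with sa_layout_equal(), and lot_instance breaks residual hash ties. A self-contained restatement, where toy_table stands in for zfs_crc64_table (only its shape matters here):

    #include <stdint.h>

    static const uint64_t toy_table[256] = { 0 };	/* any fixed table works */

    static uint64_t
    layout_hash(const uint16_t *attrs, int attr_count)
    {
    	uint64_t crc = -1ULL;

    	for (int i = 0; i != attr_count; i++)
    		crc ^= toy_table[(-1ULL ^ attrs[i]) & 0xFF];
    	return (crc);
    }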
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+/*
+ * Main attribute lookup/update function
+ * returns 0 for success or non-zero for failure
+ *
+ * Operates on a bulk array; the first failure aborts further processing
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (ENOENT);
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ }
+ }
+ return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ sa->sa_layout_attr_obj = zap_create(os,
+ DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+ &sa->sa_layout_attr_obj, tx) == 0);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (EFBIG);
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different sizes:
+ * first, the SA header size;
+ * second, the number of bytes to be stored;
+ * and, if a spill would occur, the index into the attribute array is returned.
+ *
+ * The boolean will_spill will be set when spilling is necessary. It
+ * is only set when the buftype is SA_BONUS.
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
+ boolean_t *will_spill)
+{
+ int var_size = 0;
+ int i;
+ int full_space;
+ int hdrsize;
+ boolean_t done = B_FALSE;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+
+ if (buftype == SA_BONUS)
+ *will_spill = B_FALSE;
+
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz;
+
+ *total += attr_desc[i].sa_length;
+ if (done)
+ goto next;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz) {
+ var_size++;
+ }
+
+ if (is_var_sz && var_size > 1) {
+ if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+ *total < full_space) {
+ hdrsize += sizeof (uint16_t);
+ } else {
+ done = B_TRUE;
+ *index = i;
+ if (buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * Find the index of where a spill *could* occur,
+ * then continue counting the remaining attribute
+ * space. The sum is used later for sizing the bonus
+ * and spill buffers.
+ */
+ if (buftype == SA_BONUS && *index == -1 &&
+ P2ROUNDUP(*total + hdrsize, 8) >
+ (full_space - sizeof (blkptr_t))) {
+ *index = i;
+ done = B_TRUE;
+ }
+
+next:
+ if (P2ROUNDUP(*total + hdrsize, 8) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
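
The header sizing above grows by one uint16_t length slot for each variable-sized attribute after the first, and the result is finally rounded up to an 8-byte boundary. A worked example, using the usual P2ROUNDUP definition (assumed here, not quoted from this tree):

    #include <stdint.h>
    #include <stdio.h>

    #define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

    int
    main(void)
    {
    	/* 8-byte sa_hdr_phys_t plus two extra 2-byte length slots. */
    	int hdrsize = 8 + 2 * (int)sizeof (uint16_t);

    	/* Prints "12 -> 16": the header rounds up to the next 8 bytes. */
    	printf("%d -> %d\n", hdrsize, (int)P2ROUNDUP(hdrsize, 8));
    	return (0);
    }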
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find layout that corresponds to ordering of attributes
+ * If not found a new layout number is created and added to
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ int buf_space;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int hdrsize, spillhdrsize;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, &i, &used, &spilling);
+
+ if (used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+ &hdl->sa_spill) == 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+ attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+ &spill_used, &dummy);
+
+ if (spill_used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ if (spilling)
+ buf_space = (sa->sa_force_spill) ?
+ 0 : SA_BLKPTR_SPACE - hdrsize;
+ else
+ buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+
+ if (buf_space < length) { /* switch to spill buffer */
+ VERIFY(bonustype == DMU_OT_SA);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ buf_space -= P2ROUNDUP(length, 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+ * Must be DMU_OT_SA for arbitrary layouts
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error || (error == 0 && sa_attr_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = ENOENT;
+ switch (error) {
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ default:
+ goto bail;
+ }
+ }
+
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+ * Attribute table is constructed from requested attribute list,
+ * previously foreign registered attributes, and also the legacy
+ * ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) +1);
+ }
+ zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ return ((error != 0) ? error : EINVAL);
+}
+
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+ int error;
+
+ mutex_enter(&os->os_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ *user_table = tb;
+ return (0);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ os->os_sa = sa;
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ }
+
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
+
+ if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error || (error == 0 && layout_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto fail;
+ }
+
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ *user_table = os->os_sa->sa_user_table;
+ mutex_exit(&sa->sa_lock);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ sa_free_attr_table(sa);
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
+ sa_idx_tab_t *tab;
+ while (tab = list_head(&layout->lot_idx_tab)) {
+ ASSERT(refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
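
TOC_ATTR_ENCODE above packs a variable-length slot index together with the attribute's byte offset from the header into one 32-bit table entry. The real macro lives in sa_impl.h; a hypothetical packing, with invented field widths, just to make the shape concrete:

    /* Illustrative only: the real TOC_ATTR_ENCODE/TOC_OFF are in sa_impl.h. */
    #define SKETCH_TOC_ENCODE(x, len_idx, off) \
        ((x) = ((uint32_t)(len_idx) << 24) | ((uint32_t)(off) & 0xffffff))
    #define SKETCH_TOC_LEN_IDX(x)  ((x) >> 24)
    #define SKETCH_TOC_OFF(x)      ((x) & 0xffffff)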
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start = NULL;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int num_lengths = 1;
+ int i;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+ * Determine the number of variable lengths in the header.
+ * The standard 8-byte header has one length for free, and a
+ * 16-byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
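
Worked example for the length count above: the 8-byte header carries one length slot for free, and each further 2 bytes of header adds one more, so a 16-byte header gives num_lengths = 1 + (16 - 8) / 2 = 5, the "4 + 1" the comment refers to.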
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? Only check if this is not an old znode. */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+sa_evict(dmu_buf_t *db, void *sap)
+{
+ panic("evicting sa dbuf %p\n", (void *)db);
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
+ NULL, NULL, NULL);
+
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ dmu_object_info_t doi;
+ sa_handle_t *handle;
+
+#ifdef ZFS_DEBUG
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /* Find the handle if it exists; otherwise create and initialize one. */
+
+ handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+ if (handle == NULL) {
+ sa_handle_t *newhandle;
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+ newhandle = (hdl_type == SA_HDL_SHARED) ?
+ dmu_buf_set_user_ie(db, handle,
+ NULL, sa_evict) : NULL;
+
+ if (newhandle != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = newhandle;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
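
Note the race handling above for shared handles: two threads may build a handle for the same dbuf concurrently, and dmu_buf_set_user_ie() returns the already-installed user pointer when it loses the install race, so the loser frees its freshly built handle and adopts the winner's.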
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if ((error = dmu_bonus_hold(objset, objid, NULL, &db)) != 0)
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
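
A hypothetical caller sketch for sa_lookup(); size_attr stands in for an attribute id taken from the table returned by sa_setup(), and is not a name defined in this file:

    /* Hypothetical caller: read one fixed-size 8-byte attribute. */
    static int
    get_size_attr(sa_handle_t *hdl, sa_attr_type_t size_attr, uint64_t *sizep)
    {
        return (sa_lookup(hdl, size_attr, sizep, sizeof (*sizep)));
    }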
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ uio->uio_resid), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+#endif
+
+void *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+ * Determine the layout number. If this is an SA node and the
+ * header is 0, force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or when crypto is enabled,
+ * which doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT((IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+ * See if any of the already existing TOC entries can be reused.
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
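
sa_default_locator() is the degenerate locator: one buffer, described in full on the first call. For contrast, a hypothetical locator that feeds the copier two chunks, assuming the contract that start is B_TRUE on the first call and the callback is re-invoked until total_len bytes have been described:

    /* Hypothetical two-chunk locator, illustrating the callback contract. */
    typedef struct two_chunk {
        void      *tc_bufs[2];
        uint32_t  tc_lens[2];
        int       tc_next;
    } two_chunk_t;

    static void
    two_chunk_locator(void **dataptr, uint32_t *len, uint32_t total_len,
        boolean_t start, void *userdata)
    {
        two_chunk_t *tc = userdata;

        if (start)
            tc->tc_next = 0;
        *dataptr = tc->tc_bufs[tc->tc_next];
        *len = tc->tc_lens[tc->tc_next];
        tc->tc_next++;
    }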
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
+ SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
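
ATTR_ENCODE above packs the attribute number, its registered fixed length, and its byteswap-function index into the single uint64 written to the registry ZAP. The authoritative bit layout is in sa_impl.h; an assumed packing, for illustration only:

    /* Assumed field positions; see sa_impl.h for the real ATTR_ENCODE. */
    #define SKETCH_ATTR_ENCODE(x, attr, length, bswap) \
        ((x) = ((uint64_t)(attr) << 24 | (uint64_t)(length) << 16 | \
        (uint64_t)(bswap) << 8))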
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to be used only for bulk adding of
+ * attributes for a new file. It will also be used by the ZPL
+ * when converting an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Add, remove, or replace a single attribute and then rewrite the
+ * entire set of attributes.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size, spill_data_size;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring the spill buffer online if it isn't already */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+ * Loop through the bonus buffer and, if it exists, the spill
+ * buffer, and build up a new attribute descriptor to reset the
+ * attributes.
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /* iterate over each attribute in layout */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ if (attr == newattr) {
+ if (action == SA_REMOVE) {
+ j++;
+ continue;
+ }
+ ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
+ ASSERT(action == SA_REPLACE);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ length = SA_REGISTERED_LEN(sa, attr);
+ if (length == 0) {
+ length = hdr->sa_lengths[length_idx++];
+ }
+
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ length = SA_REGISTERED_LEN(sa, newattr);
+ if (length == 0) {
+ length = buflen;
+ }
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ kmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ return (error);
+}
+
+/*
+ * Update or add a new attribute.
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
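
sa_update() must be called with an assigned transaction. A hypothetical caller sketch, assuming the usual DMU transaction sequence and the dmu_tx_hold_sa() hold this code expects elsewhere (treat the exact hold flags as an assumption):

    /* Hypothetical caller: update one attribute transactionally. */
    static int
    set_attr(objset_t *os, sa_handle_t *hdl, sa_attr_type_t attr, uint64_t val)
    {
        dmu_tx_t *tx = dmu_tx_create(os);
        int error;

        dmu_tx_hold_sa(tx, hdl, B_FALSE);  /* no spill growth expected */
        if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
            dmu_tx_abort(tx);
            return (error);
        }
        error = sa_update(hdl, attr, &val, sizeof (val), tx);
        dmu_tx_commit(tx);
        return (error);
    }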
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+ uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = userdata;
+ bulk.sa_data_func = locator;
+ bulk.sa_length = buflen;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Return the size of an attribute.
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+ int error;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
+{
+ (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
+ oldhdl, newhdl, NULL, sa_evict);
+ oldhdl->sa_bonus = NULL;
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
index ca7076cb6fd9..816c09aa0371 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -19,111 +19,36 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * SHA-256 checksum, as specified in FIPS 180-3, available at:
- * http://csrc.nist.gov/publications/PubsFIPS.html
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
- *
- * Ch(x, y, z) (x & y) ^ (~x & z)
- * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z)
- *
- * We use equivalent logical reductions here that require one less op.
- */
-#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
-#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
-{
- uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
- for (t = 0; t < 16; t++, cp += 4)
- W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
-
- for (t = 16; t < 64; t++)
- W[t] = sigma1(W[t - 2]) + W[t - 7] +
- sigma0(W[t - 15]) + W[t - 16];
-
- a = H[0]; b = H[1]; c = H[2]; d = H[3];
- e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
- for (t = 0; t < 64; t++) {
- T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
- T2 = SIGMA0(a) + Maj(a, b, c);
- h = g; g = f; f = e; e = d + T1;
- d = c; c = b; b = a; a = T1 + T2;
- }
-
- H[0] += a; H[1] += b; H[2] += c; H[3] += d;
- H[4] += e; H[5] += f; H[6] += g; H[7] += h;
-}
+#ifdef _KERNEL
+#include <crypto/sha2/sha2.h>
+#else
+#include <sha256.h>
+#endif
void
zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
- uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
- 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
- uint8_t pad[128];
- int i, padsize;
-
- for (i = 0; i < (size & ~63ULL); i += 64)
- SHA256Transform(H, (uint8_t *)buf + i);
-
- for (padsize = 0; i < size; i++)
- pad[padsize++] = *((uint8_t *)buf + i);
-
- for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
- pad[padsize] = 0;
-
- for (i = 56; i >= 0; i -= 8)
- pad[padsize++] = (size << 3) >> i;
-
- for (i = 0; i < padsize; i += 64)
- SHA256Transform(H, pad + i);
-
- ZIO_SET_CHECKSUM(zcp,
- (uint64_t)H[0] << 32 | H[1],
- (uint64_t)H[2] << 32 | H[3],
- (uint64_t)H[4] << 32 | H[5],
- (uint64_t)H[6] << 32 | H[7]);
+ SHA256_CTX ctx;
+ zio_cksum_t tmp;
+
+ SHA256_Init(&ctx);
+ SHA256_Update(&ctx, buf, size);
+ SHA256_Final((unsigned char *)&tmp, &ctx);
+
+ /*
+ * A prior implementation of this function used a private
+ * SHA256 implementation that always wrote things out in
+ * big endian, and there wasn't a byteswap variant of it.
+ * To preserve on-disk compatibility we need to force that
+ * behaviour.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
}
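
Worked check of the byte-order fix-up: on a little-endian host, SHA256_Final() deposits the digest big-endian byte first, so reading tmp.zc_word[0] natively yields a byte-swapped value; BE_64() swaps it back, reproducing the (uint64_t)H[0] << 32 | H[1] words the removed private implementation stored. On big-endian hosts BE_64() is the identity, so the on-disk checksum is unchanged either way.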
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index c04102e17e84..9336a6b516c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -35,13 +34,14 @@
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
+#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -56,8 +56,16 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
-#include <sys/sunddi.h>
#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/zone.h>
+#endif /* _KERNEL */
#include "zfs_prop.h"
#include "zfs_comutil.h"
@@ -70,17 +78,17 @@ TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
"Check hostid on import?");
-enum zti_modes {
+typedef enum zti_modes {
zti_mode_fixed, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */
- zti_mode_tune, /* fill from zio_taskq_tune_* */
+ zti_mode_batch, /* cpu-intensive; value is ignored */
zti_mode_null, /* don't create a taskq */
zti_nmodes
-};
+} zti_modes_t;
#define ZTI_FIX(n) { zti_mode_fixed, (n) }
#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
-#define ZTI_TUNE { zti_mode_tune, 0 }
+#define ZTI_BATCH { zti_mode_batch, 0 }
#define ZTI_NULL { zti_mode_null, 0 }
#define ZTI_ONE ZTI_FIX(1)
@@ -91,7 +99,7 @@ typedef struct zio_taskq_info {
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
- "issue", "issue_high", "intr", "intr_high"
+ "issue", "issue_high", "intr", "intr_high"
};
/*
@@ -101,18 +109,36 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
- { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL },
- { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
+ { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
+ { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
};
-enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
-uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */
-
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
+
+uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
+#ifdef PSRSET_BIND
+id_t zio_taskq_psrset_bind = PS_NONE;
+#endif
+#ifdef SYSDC
+boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+#endif
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
+
+boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
/*
* ==========================================================================
@@ -149,7 +175,7 @@ static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
uint64_t size;
- uint64_t used;
+ uint64_t alloc;
uint64_t cap, version;
zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp;
@@ -157,17 +183,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (spa->spa_root_vdev != NULL) {
- size = spa_get_space(spa);
- used = spa_get_alloc(spa);
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ size = metaslab_class_get_space(spa_normal_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
- size - used, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+ size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == FREAD), src);
- cap = (size == 0) ? 0 : (used * 100 / size);
+ cap = (size == 0) ? 0 : (alloc * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
spa->spa_root_vdev->vdev_state, src);
@@ -202,9 +233,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
+ objset_t *mos = spa->spa_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
- objset_t *mos = spa->spa_meta_objset;
int err;
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -217,7 +248,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
spa_prop_get_config(spa, nvp);
/* If no pool property object, no more prop to get. */
- if (spa->spa_pool_props_object == 0) {
+ if (mos == NULL || spa->spa_pool_props_object == 0) {
mutex_exit(&spa->spa_props_lock);
return (0);
}
@@ -338,6 +369,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = EINVAL;
@@ -375,12 +407,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
break;
}
- if (error = dmu_objset_open(strval, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os))
+ if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* We don't support gzip bootable datasets */
- if ((error = dsl_prop_get_integer(strval,
+ /* Must be ZPL and not gzip compressed. */
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ error = ENOTSUP;
+ } else if ((error = dsl_prop_get_integer(strval,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
&compress, NULL)) == 0 &&
!BOOTFS_COMPRESS_VALID(compress)) {
@@ -388,7 +422,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
} else {
objnum = dmu_objset_id(os);
}
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
break;
@@ -436,6 +470,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
strcmp(slash, "/..") == 0)
error = EINVAL;
break;
+
+ case ZPOOL_PROP_DEDUPDITTO:
+ if (spa_version(spa) < SPA_VERSION_DEDUP)
+ error = ENOTSUP;
+ else
+ error = nvpair_value_uint64(elem, &intval);
+ if (error == 0 &&
+ intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+ error = EINVAL;
+ break;
}
if (error)
@@ -497,7 +541,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
nvpair_name(elem))) == ZPROP_INVAL)
return (EINVAL);
- if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
continue;
need_sync = B_TRUE;
@@ -569,20 +615,55 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
offsetof(spa_error_entry_t, se_avl));
}
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa, int mode)
+static taskq_t *
+spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+ uint_t value)
{
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ uint_t flags = TASKQ_PREPOPULATE;
+ boolean_t batch = B_FALSE;
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_mode = mode;
+ switch (mode) {
+ case zti_mode_null:
+ return (NULL); /* no taskq needed */
+
+ case zti_mode_fixed:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ break;
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ case zti_mode_batch:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = zio_taskq_batch_pct;
+ break;
+ case zti_mode_online_percent:
+ flags |= TASKQ_THREADS_CPU_PCT;
+ break;
+
+ default:
+ panic("unrecognized mode for %s taskq (%u:%u) in "
+ "spa_activate()",
+ name, mode, value);
+ break;
+ }
+
+#ifdef SYSDC
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ return (taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags));
+ }
+#endif
+ return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+ spa->spa_proc, flags));
+}
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
@@ -593,43 +674,137 @@ spa_activate(spa_t *spa, int mode)
(void) snprintf(name, sizeof (name),
"%s_%s", zio_type_name[t], zio_taskq_types[q]);
- if (mode == zti_mode_tune) {
- mode = zio_taskq_tune_mode;
- value = zio_taskq_tune_value;
- if (mode == zti_mode_tune)
- mode = zti_mode_online_percent;
- }
+ spa->spa_zio_taskq[t][q] =
+ spa_taskq_create(spa, name, mode, value);
+ }
+ }
+}
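
The snprintf in the loop above yields taskq names of the form "<type>_<q>", e.g. "zio_write_issue" or "zio_read_intr", assuming the conventional "zio_"-prefixed entries in zio_type_name[]; these are the thread names that show up on a running system.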
+
+#ifdef _KERNEL
+#ifdef SPA_PROCESS
+static void
+spa_thread(void *arg)
+{
+ callb_cpr_t cprinfo;
- switch (mode) {
- case zti_mode_fixed:
- ASSERT3U(value, >=, 1);
- value = MAX(value, 1);
+ spa_t *spa = arg;
+ user_t *pu = PTOU(curproc);
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+ spa->spa_name);
+
+ ASSERT(curproc != &p0);
+ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+ "zpool-%s", spa->spa_name);
+ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+#ifdef PSRSET_BIND
+ /* bind this thread to the requested psrset */
+ if (zio_taskq_psrset_bind != PS_NONE) {
+ pool_lock();
+ mutex_enter(&cpu_lock);
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+ 0, NULL, NULL) == 0) {
+ curthread->t_bind_pset = zio_taskq_psrset_bind;
+ } else {
+ cmn_err(CE_WARN,
+ "Couldn't bind process for zfs pool \"%s\" to "
+ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+ }
- spa->spa_zio_taskq[t][q] = taskq_create(name,
- value, maxclsyspri, 50, INT_MAX,
- TASKQ_PREPOPULATE);
- break;
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ }
+#endif
- case zti_mode_online_percent:
- spa->spa_zio_taskq[t][q] = taskq_create(name,
- value, maxclsyspri, 50, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
- break;
+#ifdef SYSDC
+ if (zio_taskq_sysdc) {
+ sysdc_thread_enter(curthread, 100, 0);
+ }
+#endif
- case zti_mode_null:
- spa->spa_zio_taskq[t][q] = NULL;
- break;
+ spa->spa_proc = curproc;
+ spa->spa_did = curthread->t_did;
- case zti_mode_tune:
- default:
- panic("unrecognized mode for "
- "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
- "in spa_activate()",
- t, q, mode, value);
- break;
+ spa_create_zio_taskqs(spa);
+
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+ spa->spa_proc_state = SPA_PROC_ACTIVE;
+ cv_broadcast(&spa->spa_proc_cv);
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+ spa->spa_proc_state = SPA_PROC_GONE;
+ spa->spa_proc = &p0;
+ cv_broadcast(&spa->spa_proc_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
+
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+}
+#endif /* SPA_PROCESS */
+#endif
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, int mode)
+{
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
+
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+ /* Try to create a covering process */
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+ ASSERT(spa->spa_proc == &p0);
+ spa->spa_did = 0;
+
+#ifdef SPA_PROCESS
+ /* Only create a process if we're going to be around a while. */
+ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+ NULL, 0) == 0) {
+ spa->spa_proc_state = SPA_PROC_CREATED;
+ while (spa->spa_proc_state == SPA_PROC_CREATED) {
+ cv_wait(&spa->spa_proc_cv,
+ &spa->spa_proc_lock);
}
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ ASSERT(spa->spa_proc != &p0);
+ ASSERT(spa->spa_did != 0);
+ } else {
+#ifdef _KERNEL
+ cmn_err(CE_WARN,
+ "Couldn't create process for zfs pool \"%s\"\n",
+ spa->spa_name);
+#endif
}
}
+#endif /* SPA_PROCESS */
+ mutex_exit(&spa->spa_proc_lock);
+
+ /* If we didn't create a process, we need to create our taskqs. */
+ ASSERT(spa->spa_proc == &p0);
+ if (spa->spa_proc == &p0) {
+ spa_create_zio_taskqs(spa);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
@@ -688,6 +863,33 @@ spa_deactivate(spa_t *spa)
avl_destroy(&spa->spa_errlist_last);
spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+ mutex_enter(&spa->spa_proc_lock);
+ if (spa->spa_proc_state != SPA_PROC_NONE) {
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+ cv_broadcast(&spa->spa_proc_cv);
+ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+ ASSERT(spa->spa_proc != &p0);
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+ spa->spa_proc_state = SPA_PROC_NONE;
+ }
+ ASSERT(spa->spa_proc == &p0);
+ mutex_exit(&spa->spa_proc_lock);
+
+#ifdef SPA_PROCESS
+ /*
+ * We want to make sure spa_thread() has actually exited the ZFS
+ * module, so that the module can't be unloaded out from underneath
+ * it.
+ */
+ if (spa->spa_did != 0) {
+ thread_join(spa->spa_did);
+ spa->spa_did = 0;
+ }
+#endif /* SPA_PROCESS */
}
/*
@@ -701,7 +903,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
uint_t id, int atype)
{
nvlist_t **child;
- uint_t c, children;
+ uint_t children;
int error;
if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
@@ -722,7 +924,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
return (EINVAL);
}
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vdev_t *vd;
if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
atype)) != 0) {
@@ -768,14 +970,19 @@ spa_unload(spa_t *spa)
spa->spa_async_zio_root = NULL;
}
+ bpobj_close(&spa->spa_deferred_bpobj);
+
/*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
dsl_pool_close(spa->spa_dsl_pool);
spa->spa_dsl_pool = NULL;
+ spa->spa_meta_objset = NULL;
}
+ ddt_unload(spa);
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -928,7 +1135,7 @@ spa_load_spares(spa_t *spa)
KM_SLEEP);
for (i = 0; i < spa->spa_spares.sav_count; i++)
spares[i] = vdev_config_generate(spa,
- spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -950,7 +1157,7 @@ spa_load_l2cache(spa_t *spa)
nvlist_t **l2cache;
uint_t nl2cache;
int i, j, oldnvdevs;
- uint64_t guid, size;
+ uint64_t guid;
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
@@ -1014,12 +1221,8 @@ spa_load_l2cache(spa_t *spa)
(void) vdev_validate_aux(vd);
- if (!vdev_is_dead(vd)) {
- size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
}
}
@@ -1058,7 +1261,7 @@ spa_load_l2cache(spa_t *spa)
l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
- sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
@@ -1098,9 +1301,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
static void
spa_check_removed(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
spa_check_removed(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
@@ -1110,36 +1311,131 @@ spa_check_removed(vdev_t *vd)
}
/*
- * Load the slog device state from the config object since it's possible
- * that the label does not contain the most up-to-date information.
+ * Validate the current config against the MOS config
*/
-void
-spa_load_log_state(spa_t *spa)
+static boolean_t
+spa_config_valid(spa_t *spa, nvlist_t *config)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children, c;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
- for (c = 0; c < children; c++) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+ ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+
+ /*
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing devices in this config.
+ * We'll pass this up to the user for further processing.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops &&
+ mtvd->vdev_islog)
+ child[idx++] = vdev_config_generate(spa, mtvd,
+ B_FALSE, 0);
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+
+ for (int i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (char **));
+ }
+
+ /*
+ * Compare the root vdev tree with the information we have
+ * from the MOS config (mrvd). Check each top-level vdev
+ * with the corresponding MOS config top-level (mtvd).
+ */
+ for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ /*
+ * Resolve any "missing" vdevs in the current configuration.
+ * If we find that the MOS config has more accurate information
+ * about the top-level vdev then use that vdev instead.
+ */
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops) {
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
+ continue;
+
+ /*
+ * Device specific actions.
+ */
+ if (mtvd->vdev_islog) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * XXX - once we have 'readonly' pool
+ * support we should be able to handle
+ * missing data devices by transitioning
+ * the pool to readonly.
+ */
+ continue;
+ }
+
+ /*
+ * Swap the missing vdev with the data we were
+ * able to obtain from the MOS config.
+ */
+ vdev_remove_child(rvd, tvd);
+ vdev_remove_child(mrvd, mtvd);
+
+ vdev_add_child(rvd, mtvd);
+ vdev_add_child(mrvd, tvd);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ vdev_load(mtvd);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ vdev_reopen(rvd);
+ } else if (mtvd->vdev_islog) {
+ /*
+ * Load the slog device's state from the MOS config
+ * since it's possible that the label does not
+ * contain the most up-to-date information.
+ */
+ vdev_load_log_state(tvd, mtvd);
+ vdev_reopen(tvd);
+ }
}
- nvlist_free(nv);
+ vdev_free(mrvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Ensure we were able to validate the config.
+ */
+ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}
/*
* Check for missing log devices
*/
-int
+static int
spa_check_logs(spa_t *spa)
{
switch (spa->spa_log_state) {
@@ -1148,7 +1444,7 @@ spa_check_logs(spa_t *spa)
case SPA_LOG_UNKNOWN:
if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
DS_FIND_CHILDREN)) {
- spa->spa_log_state = SPA_LOG_MISSING;
+ spa_set_log_state(spa, SPA_LOG_MISSING);
return (1);
}
break;
@@ -1156,47 +1452,310 @@ spa_check_logs(spa_t *spa)
return (0);
}
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ if (!spa_has_slogs(spa))
+ return (B_FALSE);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog) {
+ metaslab_group_passivate(mg);
+ slog_found = B_TRUE;
+ }
+ }
+
+ return (slog_found);
+}
+
+static void
+spa_activate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog)
+ metaslab_group_activate(mg);
+ }
+}
+
+int
+spa_offline_log(spa_t *spa)
+{
+ int error = 0;
+
+ if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ NULL, DS_FIND_CHILDREN)) == 0) {
+
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)
+{
+ int i;
+
+ for (i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+}
+
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+} spa_load_error_t;
+
+static void
+spa_load_verify_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
+
+ if (error) {
+ if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_add_64(&sle->sle_meta_count, 1);
+ else
+ atomic_add_64(&sle->sle_data_count, 1);
+ }
+ zio_data_buf_free(zio->io_data, zio->io_size);
+}
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ if (bp != NULL) {
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = zio_data_buf_alloc(size);
+
+ zio_nowait(zio_read(rio, spa, bp, data, size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ }
+ return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_rewind_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error;
+
+ zpool_get_rewind_policy(spa->spa_config, &policy);
+
+ if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+
+ (void) zio_wait(rio);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+ sle.sle_data_count <= policy.zrp_maxdata) {
+ int64_t loss = 0;
+
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = EIO;
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+}
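
Restated, the acceptance test above is: verify_ok holds only when the traversal itself succeeded and both error counters fit the rewind policy's budgets (sle_meta_count <= zrp_maxmeta and sle_data_count <= zrp_maxdata). Any other outcome surfaces as EIO, with traversal errors other than ENXIO and EIO also normalized to EIO.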
+
/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
+
+/*
+ * Find a value in the pool directory object.
*/
static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
- int error = 0;
- nvlist_t *nvroot = NULL;
- vdev_t *rvd;
- uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
- uint64_t pool_guid;
- uint64_t version;
- uint64_t autoreplace = 0;
- int orig_mode = spa->spa_mode;
- char *ereport = FM_EREPORT_ZFS_POOL;
+ return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val));
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (err);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
/*
- * If this is an untrusted config, access the pool in read-only mode.
- * This prevents things like resilvering recently removed devices.
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
*/
- if (!mosconfig)
- spa->spa_mode = FREAD;
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
- spa->spa_load_state = state;
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+ boolean_t mosconfig)
+{
+ nvlist_t *config = spa->spa_config;
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+ uint64_t pool_guid;
+ nvlist_t *nvl;
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- error = EINVAL;
- goto out;
- }
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
/*
* Versioning wasn't explicitly added to the label until later, so if
* it's not present treat it as the initial version.
*/
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- version = SPA_VERSION_INITIAL;
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&spa->spa_config_txg);
@@ -1204,10 +1763,70 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0)) {
error = EEXIST;
- goto out;
+ } else {
+ spa->spa_load_guid = pool_guid;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ &nvl) == 0) {
+ VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+ KM_SLEEP) == 0);
+ }
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, pool_guid, config, state, type,
+ mosconfig, &ereport);
}
- spa->spa_load_guid = pool_guid;
+ spa->spa_minref = refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+static int
+spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t children, config_cache_txg = spa->spa_config_txg;
+ int orig_mode = spa->spa_mode;
+ int parse;
+ uint64_t obj;
+
+ /*
+ * If this is an untrusted config, access the pool in read-only mode.
+ * This prevents things like resilvering recently removed devices.
+ */
+ if (!mosconfig)
+ spa->spa_mode = FREAD;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+ return (EINVAL);
+
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
/*
* Create "The Godfather" zio to hold all async IOs
@@ -1221,15 +1840,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* configuration requires knowing the version number.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa->spa_ubsync.ub_version = version;
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
- goto out;
+ return (error);
ASSERT(spa->spa_root_vdev == rvd);
- ASSERT(spa_guid(spa) == pool_guid);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
/*
* Try to open all vdevs, loading each label in the process.
@@ -1238,26 +1859,31 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
error = vdev_open(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
- goto out;
+ return (error);
/*
* We need to validate the vdev labels against the configuration that
* we have in hand, which is dependent on the setting of mosconfig. If
* mosconfig is true then we're validating the vdev labels based on
- * that config. Otherwise, we're validating against the cached config
+ * that config. Otherwise, we're validating against the cached config
* (zpool.cache) that was read when we loaded the zfs module, and then
* later we will recursively call spa_load() and validate against
* the vdev config.
+ *
+ * If we're assembling a new pool that's been split off from an
+	 * existing pool, the labels haven't yet been updated, so we skip
+ * validation for now.
*/
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0)
- goto out;
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
+ if (error != 0)
+ return (error);
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
}
/*
@@ -1268,32 +1894,33 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* If we weren't able to find a single valid uberblock, return failure.
*/
- if (ub->ub_txg == 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = ENXIO;
- goto out;
- }
+ if (ub->ub_txg == 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
/*
* If the pool is newer than the code, we can't open it.
*/
- if (ub->ub_version > SPA_VERSION) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_VERSION_NEWER);
- error = ENOTSUP;
- goto out;
- }
+ if (ub->ub_version > SPA_VERSION)
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
/*
* If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
+ * incomplete configuration. We first check to see if the pool
+	 * incomplete configuration.  We first check to see if the pool
+	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
+	 * If it is, defer the vdev_guid_sum check until later so we
+ * can handle missing vdevs.
*/
- if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_GUID_SUM);
- error = ENXIO;
- goto out;
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ rvd->vdev_guid_sum != ub->ub_guid_sum)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
}
/*
@@ -1301,221 +1928,174 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
*/
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+
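
The txg bookkeeping just above defines the rewind window: verification may look back past the deferred-free window, and claiming starts from the rewound txg when one exists. A small arithmetic sketch, assuming TXG_DEFER_SIZE is 2 and TXG_INITIAL is 4 (both values are assumptions here, not quoted from the headers):

    #include <inttypes.h>
    #include <stdio.h>

    #define TXG_INITIAL     4       /* assumed */
    #define TXG_DEFER_SIZE  2       /* assumed */

    int
    main(void)
    {
        uint64_t last_synced = 100;     /* example last synced txg */
        uint64_t last_ubsync = 0;       /* nonzero only after a rewind */
        int extreme_rewind = 0;

        /* Verification may reach back past the deferred-free window. */
        uint64_t verify_min = extreme_rewind ?
            TXG_INITIAL - 1 : last_synced - TXG_DEFER_SIZE - 1;

        /* Claiming starts at the rewound txg when one exists. */
        uint64_t first = last_ubsync ? last_ubsync : last_synced + 1;

        printf("verify_min=%" PRIu64 " first=%" PRIu64 "\n",
            verify_min, first);         /* 97 and 101 here */
        return (0);
    }
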
error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
- if (error) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- goto out;
- }
+ if (error)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
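
From here on the diff repeatedly collapses an eight-line zap_lookup() plus vdev_set_state() block into single spa_dir_prop() and spa_vdev_err() calls. Their definitions sit elsewhere in this commit; reconstructed from the call sites, they are likely shaped along these lines (a sketch, not a quotation):

    /*
     * Sketch reconstructed from the call sites; the real definitions
     * appear earlier in spa.c in this commit.
     */
    static int
    spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
    {
        /* One uint64 out of the MOS pool-directory object. */
        return (zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val));
    }

    static int
    spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
    {
        /* Mark the vdev unopenable, then hand back the errno. */
        vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
        return (err);
    }
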
if (!mosconfig) {
- nvlist_t *newconfig;
uint64_t hostid;
+ nvlist_t *policy = NULL, *nvconfig;
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(newconfig,
+ VERIFY(nvlist_lookup_string(nvconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
+#ifdef _KERNEL
+ myhostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so
+ * we can't use zone_get_hostid().
+ */
(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
+#endif /* _KERNEL */
if (check_hostid && hostid != 0 && myhostid != 0 &&
- (unsigned long)hostid != myhostid) {
+ hostid != myhostid) {
+ nvlist_free(nvconfig);
cmn_err(CE_WARN, "pool '%s' could not be "
"loaded as it was last accessed by "
"another system (host: %s hostid: 0x%lx). "
"See: http://www.sun.com/msg/ZFS-8000-EY",
spa_name(spa), hostname,
(unsigned long)hostid);
- error = EBADF;
- goto out;
+ return (EBADF);
}
}
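
The hostid guard above only rejects the pool when both sides actually know their hostid and the check has not been disabled. A standalone sketch of that predicate, with check_hostid mirroring the tunable the surrounding code references:

    #include <errno.h>
    #include <stdio.h>

    static int check_hostid = 1;    /* analogue of the tunable above */

    static int
    hostid_guard(unsigned long pool_hostid, unsigned long my_hostid)
    {
        /* Zero on either side means "unknown": never block on it. */
        if (check_hostid && pool_hostid != 0 && my_hostid != 0 &&
            pool_hostid != my_hostid) {
            fprintf(stderr, "pool last touched by host 0x%lx\n",
                pool_hostid);
            return (EBADF);
        }
        return (0);
    }

    int
    main(void)
    {
        return (hostid_guard(0xdeadUL, 0xbeefUL) == EBADF ? 0 : 1);
    }
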
+ if (nvlist_lookup_nvlist(spa->spa_config,
+ ZPOOL_REWIND_POLICY, &policy) == 0)
+ VERIFY(nvlist_add_nvlist(nvconfig,
+ ZPOOL_REWIND_POLICY, policy) == 0);
- spa_config_set(spa, newconfig);
+ spa_config_set(spa, nvconfig);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
- return (spa_load(spa, newconfig, state, B_TRUE));
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the bit that tells us to use the new accounting function
* (raid-z deflation). If we have an older pool, this will not
* be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
- sizeof (uint64_t), 1, &spa->spa_errlog_last);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
- sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the history object. If we have an older pool, this
* will not be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
- sizeof (uint64_t), 1, &spa->spa_history);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * If we're assembling the pool from the split-off vdevs of
+ * an existing pool, we don't want to attach the spares & cache
+ * devices.
+ */
/*
* Load any hot spares for this pool.
*/
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
+ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
if (load_nvlist(spa, spa->spa_spares.sav_object,
- &spa->spa_spares.sav_config) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ &spa->spa_spares.sav_config) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_spares.sav_sync = B_TRUE;
}
/*
* Load any level 2 ARC devices for this pool.
*/
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
+ error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
&spa->spa_l2cache.sav_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
if (load_nvlist(spa, spa->spa_l2cache.sav_object,
- &spa->spa_l2cache.sav_config) != 0) {
- vdev_set_state(rvd, B_TRUE,
- VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ &spa->spa_l2cache.sav_config) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_l2cache.sav_sync = B_TRUE;
}
- spa_load_log_state(spa);
-
- if (spa_check_logs(spa)) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LOG);
- error = ENXIO;
- ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- goto out;
- }
-
-
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
-
- if (error && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
+ if (error && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (error == 0) {
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
- sizeof (uint64_t), 1, &spa->spa_bootfs);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
- sizeof (uint64_t), 1, &autoreplace);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
- sizeof (uint64_t), 1, &spa->spa_delegation);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
- sizeof (uint64_t), 1, &spa->spa_failmode);
+ uint64_t autoreplace;
+
+ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
+ &spa->spa_dedup_ditto);
+
+ spa->spa_autoreplace = (autoreplace != 0);
}
/*
@@ -1525,8 +2105,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* unopenable vdevs so that the normal autoreplace handler can take
* over.
*/
- if (autoreplace && state != SPA_LOAD_TRYIMPORT)
+ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
spa_check_removed(spa->spa_root_vdev);
+ /*
+ * For the import case, this is done in spa_import(), because
+ * at this point we're using the spare definitions from
+ * the MOS config, not necessarily from the userland config.
+ */
+ if (state != SPA_LOAD_IMPORT) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+ }
/*
* Load the vdev state for all toplevel vdevs.
@@ -1541,15 +2131,60 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
/*
- * Check the state of the root vdev. If it can't be opened, it
- * indicates one or more toplevel vdevs are faulted.
+ * Load the DDTs (dedup tables).
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
+ error = ddt_load(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ spa_update_dspace(spa);
+
+ /*
+ * Validate the config, using the MOS config to fill in any
+ * information which might be missing. If we fail to validate
+ * the config then declare the pool unfit for use. If we're
+ * assembling a pool from a split, the log is not transferred
+ * over.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ nvlist_t *nvconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!spa_config_valid(spa, nvconfig)) {
+ nvlist_free(nvconfig);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+ nvlist_free(nvconfig);
+
+ /*
+		 * Now that we've validated the config, check the state of the
+ * root vdev. If it can't be opened, it indicates one or
+ * more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
+ if (spa_check_logs(spa)) {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+ }
}
- if (spa_writeable(spa)) {
+ /*
+	 * We've successfully opened the pool; verify that we're ready
+ * to start pushing transactions.
+ */
+ if (state != SPA_LOAD_TRYIMPORT) {
+ if (error = spa_load_verify(spa))
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+
+ if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
@@ -1558,31 +2193,44 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* Claim log blocks that haven't been committed yet.
* This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
*/
+ spa->spa_claiming = B_TRUE;
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
(void) dmu_objset_find(spa_name(spa),
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
- spa->spa_log_state = SPA_LOG_GOOD;
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
/*
- * Wait for all claims to sync.
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by either zil_check_log_chain()
+ * (invoked from spa_check_logs()) or zil_claim() above.
*/
- txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
* If the config cache is stale, or we have uninitialized
* metaslabs (see spa_vdev_add()), then update the config.
*
- * If spa_load_verbatim is true, trust the current
+ * If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
+ state == SPA_LOAD_IMPORT ||
+ state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
need_update = B_TRUE;
for (int c = 0; c < rvd->vdev_children; c++)
@@ -1599,19 +2247,100 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* Check all DTLs to see if anything needs resilvering.
*/
- if (vdev_resilver_needed(rvd, NULL, NULL))
+ if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(rvd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ /*
+ * Delete any inconsistent datasets.
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
}
- error = 0;
-out:
- spa->spa_minref = refcount_count(&spa->spa_refcount);
- if (error && error != EBADF)
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
- spa->spa_load_state = SPA_LOAD_NONE;
- spa->spa_ena = 0;
+ return (0);
+}
- return (error);
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+{
+ int mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
+ spa->spa_load_max_txg--;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+}
+
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+ uint64_t max_request, int rewind_flags)
+{
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+ mosconfig);
+ if (load_error == 0)
+ return (0);
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ return (load_error);
+ }
+
+ /* Price of rolling back is discarding txgs, including log */
+ if (state == SPA_LOAD_RECOVER)
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+	 * the acceptable rewind range, and we're still finding uberblocks.
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state, mosconfig);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+
+ return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
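
spa_load_best() above is the rewind search: try the requested txg and, on failure, step backwards one txg per spa_load_retry() until a load succeeds or the txg leaves the allowed window (the safe window normally, all the way back to TXG_INITIAL under extreme rewind). A standalone sketch of that control flow; try_load() and the constant values are stand-ins:

    #include <stdint.h>
    #include <stdio.h>

    #define TXG_DEFER_SIZE  2   /* assumed */
    #define TXG_INITIAL     4   /* assumed */

    /* Hypothetical: loads succeed only at or below txg 95. */
    static int
    try_load(uint64_t max_txg)
    {
        return (max_txg <= 95 ? 0 : 5 /* EIO */);
    }

    int
    main(void)
    {
        uint64_t max_txg = 100;             /* newest uberblock found */
        uint64_t safe = max_txg - TXG_DEFER_SIZE;
        int extreme = 0;
        uint64_t min_txg = extreme ? TXG_INITIAL : safe;
        int err = try_load(max_txg);

        /* Step back one txg per retry while inside the window. */
        while (err && max_txg > min_txg)
            err = try_load(--max_txg);

        printf("settled at txg %llu, err %d\n",
            (unsigned long long)max_txg, err);  /* 98, still EIO */
        return (0);
    }
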
/*
@@ -1627,11 +2356,14 @@ out:
* ambiguous state.
*/
static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
{
spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
int error;
int locked = B_FALSE;
+ int firstopen = B_FALSE;
*spapp = NULL;
@@ -1651,11 +2383,24 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
mutex_exit(&spa_namespace_lock);
return (ENOENT);
}
+
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_rewind_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zrp_request & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
spa_activate(spa, spa_mode_global);
- error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+ error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+ policy.zrp_request);
if (error == EBADF) {
/*
@@ -1680,38 +2425,66 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
* information: the state of each vdev after the
* attempted vdev_open(). Return this to the user.
*/
- if (config != NULL && spa->spa_root_vdev != NULL)
- *config = spa_config_generate(spa, NULL, -1ULL,
- B_TRUE);
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
spa_unload(spa);
spa_deactivate(spa);
- spa->spa_last_open_failed = B_TRUE;
+ spa->spa_last_open_failed = error;
if (locked)
mutex_exit(&spa_namespace_lock);
*spapp = NULL;
return (error);
- } else {
- spa->spa_last_open_failed = B_FALSE;
}
}
spa_open_ref(spa, tag);
- if (locked)
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
mutex_exit(&spa_namespace_lock);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ if (firstopen)
+ zvol_create_minors(pool);
+#endif
+#endif
+ }
*spapp = spa;
- if (config != NULL)
- *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
return (0);
}
int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
spa_open(const char *name, spa_t **spapp, void *tag)
{
- return (spa_open_common(name, spapp, tag, NULL));
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
}
/*
@@ -1782,7 +2555,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
if (spa_spare_exists(guid, &pool, NULL) &&
pool != 0ULL) {
VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_STATS,
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
vs->vs_state = VDEV_STATE_CANT_OPEN;
vs->vs_aux = VDEV_AUX_SPARED;
@@ -1839,7 +2612,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
ASSERT(vd != NULL);
VERIFY(nvlist_lookup_uint64_array(l2cache[i],
- ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
vdev_get_stats(vd, vs);
}
}
@@ -1852,7 +2626,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_t *spa;
*config = NULL;
- error = spa_open_common(name, &spa, FTAG, config);
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
if (spa != NULL) {
/*
@@ -1863,6 +2637,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
VERIFY(nvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
spa_get_errlog_size(spa)) == 0);
@@ -2092,11 +2873,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
- int c, error = 0;
+ int error = 0;
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version;
+ uint64_t version, obj;
/*
* If this pool already exists, return failure.
@@ -2112,11 +2893,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
+ spa = spa_add(pool, NULL, altroot);
spa_activate(spa, spa_mode_global);
- spa->spa_uberblock.ub_txg = txg - 1;
-
if (props && (error = spa_prop_validate(spa, props))) {
spa_deactivate(spa);
spa_remove(spa);
@@ -2128,6 +2907,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
&version) != 0)
version = SPA_VERSION;
ASSERT(version <= SPA_VERSION);
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
@@ -2154,9 +2936,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_metaslab_set_size(rvd->vdev_child[c]);
+ vdev_expand(rvd->vdev_child[c], txg);
+ }
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -2202,6 +2985,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
spa->spa_meta_objset = dp->dp_meta_objset;
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
+ spa_update_dspace(spa);
+
tx = dmu_tx_create_assigned(dp, txg);
/*
@@ -2217,6 +3007,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
cmn_err(CE_PANIC, "failed to add pool config");
}
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
/* Newly created pools with the right version are always deflated. */
if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
spa->spa_deflate = TRUE;
@@ -2228,20 +3024,20 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
}
/*
- * Create the deferred-free bplist object. Turn off compression
+ * Create the deferred-free bpobj. Turn off compression
* because sync-to-convergence takes longer if the blocksize
* keeps changing.
*/
- spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
- 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
ZIO_COMPRESS_OFF, tx);
-
if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bplist");
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
}
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
/*
* Create the pool's history object.
@@ -2255,9 +3051,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_sync_props(spa, props, CRED(), tx);
+ spa_sync_props(spa, props, tx);
}
dmu_tx_commit(tx);
@@ -2275,6 +3073,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
+ spa_history_log_version(spa, LOG_POOL_CREATE);
spa->spa_minref = refcount_count(&spa->spa_refcount);
@@ -2283,32 +3082,39 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
return (0);
}
-#ifdef sun
+#if defined(sun)
#ifdef _KERNEL
/*
- * Build a "root" vdev for a top level vdev read in from a rootpool
- * device label.
+ * Get the root pool information from the root disk, then import the root pool
+ * during the system boot up time.
*/
-static void
-spa_build_rootpool_config(nvlist_t *config)
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+static nvlist_t *
+spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
+ nvlist_t *config;
nvlist_t *nvtop, *nvroot;
uint64_t pgid;
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
/*
* Add this top-level vdev to the child array.
*/
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
- == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
- == 0);
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
/*
* Put this pool's top-level vdevs into a root vdev.
*/
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
- == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
@@ -2320,127 +3126,40 @@ spa_build_rootpool_config(nvlist_t *config)
*/
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
+ return (config);
}
/*
- * Get the root pool information from the root disk, then import the root pool
- * during the system boot up time.
- */
-extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
-
-int
-spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf,
- uint64_t *besttxg)
-{
- nvlist_t *config;
- uint64_t txg;
- int error;
-
- if (error = vdev_disk_read_rootlabel(devpath, devid, &config))
- return (error);
-
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
-
- if (bestconf != NULL)
- *bestconf = config;
- else
- nvlist_free(config);
- *besttxg = txg;
- return (0);
-}
-
-boolean_t
-spa_rootdev_validate(nvlist_t *nv)
-{
- uint64_t ival;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Given the boot device's physical path or devid, check if the device
- * is in a valid state. If so, return the configuration from the vdev
- * label.
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
*/
-int
-spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
+static void
+spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
- nvlist_t *conf = NULL;
- uint64_t txg = 0;
- nvlist_t *nvtop, **child;
- char *type;
- char *bootpath = NULL;
- uint_t children, c;
- char *tmp;
- int error;
-
- if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
- *tmp = '\0';
- if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
- cmn_err(CE_NOTE, "error reading device label");
- return (error);
- }
- if (txg == 0) {
- cmn_err(CE_NOTE, "this device is detached");
- nvlist_free(conf);
- return (EINVAL);
- }
-
- VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
- if (strcmp(type, VDEV_TYPE_DISK) == 0) {
- if (spa_rootdev_validate(nvtop)) {
- goto out;
- } else {
- nvlist_free(conf);
- return (EINVAL);
- }
- }
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
- ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
- VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
- /*
- * Go thru vdevs in the mirror to see if the given device
- * has the most recent txg. Only the device with the most
- * recent txg has valid information and should be booted.
- */
- for (c = 0; c < children; c++) {
- char *cdevid, *cpath;
- uint64_t tmptxg;
-
- cpath = NULL;
- cdevid = NULL;
- if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
- &cpath) != 0 && nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_DEVID, &cdevid) != 0)
- return (EINVAL);
- if ((spa_check_rootconf(cpath, cdevid, NULL,
- &tmptxg) == 0) && (tmptxg > txg)) {
- txg = tmptxg;
- VERIFY(nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_PATH, &bootpath) == 0);
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
}
+ nvlist_free(label);
}
-
- /* Does the best device match the one we've booted from? */
- if (bootpath) {
- cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
- return (EINVAL);
- }
-out:
- *bestconf = conf;
- return (0);
}
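
spa_alt_rootvdev() above is a depth-first walk that remembers whichever leaf carries the root label with the highest txg. The same recursion over a simplified tree, as a standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified vdev: leaves carry a label txg, interior nodes don't. */
    typedef struct node {
        int nchildren;
        struct node **child;
        uint64_t label_txg;
    } node_t;

    static void
    best_leaf(node_t *vd, node_t **best, uint64_t *txg)
    {
        for (int c = 0; c < vd->nchildren; c++)
            best_leaf(vd->child[c], best, txg);

        /* A leaf wins if its label is newer than the best so far. */
        if (vd->nchildren == 0 && vd->label_txg > *txg) {
            *txg = vd->label_txg;
            *best = vd;
        }
    }

    int
    main(void)
    {
        node_t a = { 0, NULL, 40 }, b = { 0, NULL, 42 };
        node_t *kids[] = { &a, &b };
        node_t root = { 2, kids, 0 };
        node_t *best = NULL;
        uint64_t txg = 0;

        best_leaf(&root, &best, &txg);
        printf("best label txg %llu\n", (unsigned long long)txg);
        return (best == &b ? 0 : 1);
    }
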
/*
@@ -2458,24 +3177,35 @@ out:
int
spa_import_rootpool(char *devpath, char *devid)
{
- nvlist_t *conf = NULL;
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
char *pname;
int error;
- spa_t *spa;
/*
- * Get the vdev pathname and configuation from the most
- * recently updated vdev (highest txg).
+ * Read the label from the boot device and generate a configuration.
*/
- if (error = spa_get_rootconf(devpath, devid, &conf))
- goto msg_out;
-
- /*
- * Add type "root" vdev to the config.
- */
- spa_build_rootpool_config(conf);
+ config = spa_generate_rootconf(devpath, devid, &guid);
+#if defined(_OBP) && defined(_KERNEL)
+ if (config == NULL) {
+ if (strstr(devpath, "/iscsi/ssd") != NULL) {
+ /* iscsi boot */
+ get_iscsi_bootpath_phy(devpath);
+ config = spa_generate_rootconf(devpath, devid, &guid);
+ }
+ }
+#endif
+ if (config == NULL) {
+ cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+ devpath);
+ return (EIO);
+ }
- VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pname)) != NULL) {
@@ -2486,71 +3216,90 @@ spa_import_rootpool(char *devpath, char *devid)
spa_remove(spa);
}
- spa = spa_add(pname, NULL);
+ spa = spa_add(pname, config, NULL);
spa->spa_is_root = B_TRUE;
- spa->spa_load_verbatim = B_TRUE;
-
- VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0);
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(conf);
- return (0);
-
-msg_out:
- cmn_err(CE_NOTE, "\n"
- " *************************************************** \n"
- " * This device is not bootable! * \n"
- " * It is either offlined or detached or faulted. * \n"
- " * Please try to boot from a different device. * \n"
- " *************************************************** ");
-
- return (error);
-}
-#endif
-#endif /* sun */
-
-/*
- * Take a pool and insert it into the namespace as if it had been loaded at
- * boot.
- */
-int
-spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
-{
- spa_t *spa;
- char *altroot = NULL;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
mutex_exit(&spa_namespace_lock);
- return (EEXIST);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
}
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
-
- spa->spa_load_verbatim = B_TRUE;
-
- VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ /*
+ * Get the boot vdev.
+ */
+ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
+ (u_longlong_t)guid);
+ error = ENOENT;
+ goto out;
+ }
- if (props != NULL)
- spa_configfile_set(spa, props, B_FALSE);
+ /*
+ * Determine if there is a better boot device.
+ */
+ avd = bvd;
+ spa_alt_rootvdev(rvd, &avd, &txg);
+ if (avd != bvd) {
+ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
+ "try booting from '%s'", avd->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ /*
+ * If the boot device is part of a spare vdev then ensure that
+ * we're booting off the active spare.
+ */
+ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ !bvd->vdev_isspare) {
+ cmn_err(CE_NOTE, "The boot device is currently spared. Please "
+ "try booting from '%s'",
+ bvd->vdev_parent->
+ vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+ error = 0;
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+out:
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&spa_namespace_lock);
- return (0);
+ nvlist_free(config);
+ return (error);
}
+#endif
+#endif /* sun */
+
/*
* Import a non-root pool into the system.
*/
int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
spa_t *spa;
char *altroot = NULL;
+ spa_load_state_t state = SPA_LOAD_IMPORT;
+ zpool_rewind_policy_t policy;
+ uint64_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
int error;
nvlist_t *nvroot;
nvlist_t **spares, **l2cache;
@@ -2560,7 +3309,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
* If a pool with this name exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pool)) != NULL) {
+ if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
return (EEXIST);
}
@@ -2570,20 +3319,57 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
- spa_activate(spa, spa_mode_global);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = FREAD;
+ spa = spa_add(pool, config, altroot);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+
+ mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+
+ return (0);
+ }
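
The old spa_import_verbatim() entry point is folded into spa_import() above as the ZFS_IMPORT_VERBATIM flag: the caller's config goes straight into the namespace and cache file with no spa_load() at all. A toy sketch of that branch (the flag value and helper names here are placeholders):

    #include <stdint.h>
    #include <stdio.h>

    #define ZFS_IMPORT_VERBATIM 0x1     /* placeholder value */

    /* Placeholder import paths, not the real routines. */
    static int import_verbatim(void) { puts("cache file only"); return (0); }
    static int import_load(void)     { puts("spa_load_best path"); return (0); }

    static int
    pool_import(uint64_t flags)
    {
        /* Verbatim: trust the caller's config, skip loading entirely. */
        if (flags & ZFS_IMPORT_VERBATIM)
            return (import_verbatim());
        return (import_load());
    }

    int
    main(void)
    {
        return (pool_import(ZFS_IMPORT_VERBATIM));
    }
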
+
+ spa_activate(spa, mode);
/*
* Don't start async tasks until we know everything is healthy.
*/
spa_async_suspend(spa);
+ zpool_get_rewind_policy(config, &policy);
+ if (policy.zrp_request & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
/*
* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
* because the user-supplied config is actually the one to trust when
* doing an import.
*/
- error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+ error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
+ policy.zrp_request);
+
+ /*
+ * Propagate anything learned while loading the pool and pass it
+ * back to caller (i.e. rewind info, missing devices, etc).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -2660,6 +3446,14 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa->spa_l2cache.sav_sync = B_TRUE;
}
+ /*
+ * Check for any removed devices.
+ */
+ if (spa->spa_autoreplace) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+
if (spa_writeable(spa)) {
/*
* Update the config cache to include the newly-imported pool.
@@ -2667,17 +3461,23 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
+ /*
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
+ */
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+
mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ zvol_create_minors(pool);
+#endif
+#endif
return (0);
}
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define TRYIMPORT_NAME "$import"
-
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
@@ -2697,7 +3497,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Create and initialize the spa structure.
*/
mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, NULL);
+ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa_activate(spa, FREAD);
/*
@@ -2705,7 +3505,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Pass TRUE for mosconfig because the user-supplied config
* is actually the one to trust when doing an import.
*/
- error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
/*
* If 'tryconfig' was at least parsable, return the current config.
@@ -2850,7 +3650,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
}
@@ -2920,13 +3721,15 @@ spa_reset(char *pool)
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
@@ -2961,9 +3764,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* Transfer each new top-level vdev from vd to rvd.
*/
for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
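
The id-selection loop above is new: rather than always appending, spa_vdev_add() reuses the slot of the first "hole" left by an earlier device removal, falling through to append when none exists. A standalone sketch of the slot search:

    #include <stdio.h>

    /* Simplified: an array of top-level slots, 1 = hole. */
    static int
    first_slot(const int *ishole, int n)
    {
        int id;

        /* Reuse the first hole; fall off the end to append. */
        for (id = 0; id < n; id++)
            if (ishole[id])
                break;
        return (id);
    }

    int
    main(void)
    {
        int holes[] = { 0, 1, 0 };

        /* The new vdev takes id 1 rather than appending at id 3. */
        printf("new id: %d\n", first_slot(holes, 3));
        return (0);
    }
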
@@ -3020,15 +3833,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
- uint64_t txg, open_txg;
+ uint64_t txg, dtl_max_txg;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
- dmu_tx_t *tx;
char *oldvdpath, *newvdpath;
int newvd_isspare;
int error;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3078,7 +3892,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* spares.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_child[1] == oldvd &&
+ oldvd->vdev_isspare &&
!spa_has_spare(spa, newvd->vdev_guid))
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -3090,23 +3904,24 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* the same (spare replaces spare, non-spare replaces
* non-spare).
*/
- if (pvd->vdev_ops == &vdev_replacing_ops)
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare)
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops != &vdev_spare_ops &&
- newvd->vdev_isspare)
+ }
+
+ if (newvd->vdev_isspare)
pvops = &vdev_spare_ops;
else
pvops = &vdev_replacing_ops;
}
/*
- * Compare the new device size with the replaceable/attachable
- * device size.
+ * Make sure the new device is big enough.
*/
- if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -3132,6 +3947,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
+ /* mark the device being resilvered */
+ newvd->vdev_resilvering = B_TRUE;
+
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@@ -3148,14 +3966,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
- /*
- * If newvd is smaller than oldvd, but larger than its rsize,
- * the addition of newvd may have decreased our parent's asize.
- */
- pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
-
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
@@ -3163,13 +3976,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_config_dirty(tvd);
/*
- * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
- * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
*/
- open_txg = txg + TXG_CONCURRENT_STATES - 1;
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
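
The DTL interval is re-expressed here as the half-open range [TXG_INITIAL, dtl_max_txg); the txgs covered are unchanged, but dtl_max_txg (one past the old open_txg) is what spa_vdev_exit() and the resilver restart key off below. A quick arithmetic check, assuming TXG_CONCURRENT_STATES is 3 and TXG_INITIAL is 4 (assumed values):

    #include <stdint.h>
    #include <stdio.h>

    #define TXG_CONCURRENT_STATES   3   /* assumed */
    #define TXG_INITIAL             4   /* assumed */

    int
    main(void)
    {
        uint64_t txg = 10;                                      /* attach txg */
        uint64_t open_txg = txg + TXG_CONCURRENT_STATES - 1;    /* 12 */
        uint64_t dtl_max = txg + TXG_CONCURRENT_STATES;         /* 13 */

        /* Same nine txgs either way: [4,12] equals [4,13). */
        printf("old size %llu, new size %llu\n",
            (unsigned long long)(open_txg - TXG_INITIAL + 1),
            (unsigned long long)(dtl_max - TXG_INITIAL));
        return (0);
    }
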
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@@ -3185,27 +3999,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_dirty(tvd, VDD_DTL, newvd, txg);
- (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+ /*
+ * Restart the resilver
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
- spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
- CRED(), "%s vdev=%s %s vdev=%s",
- replacing && newvd_isspare ? "spare in" :
- replacing ? "replace" : "attach", newvdpath,
- replacing ? "for" : "to", oldvdpath);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+ spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
+ "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
- /*
- * Kick off a resilver to update newvd.
- */
- VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
return (0);
}
@@ -3224,7 +4038,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
uint64_t unspare_guid;
- size_t len;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
@@ -3255,18 +4071,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/*
- * If replace_done is specified, only remove this device if it's
- * the first child of a replacing vdev. For the 'spare' vdev, either
- * disk can be removed.
+ * Only 'replacing' or 'spare' vdevs can be replaced.
*/
- if (replace_done) {
- if (pvd->vdev_ops == &vdev_replacing_ops) {
- if (vd->vdev_id != 0)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- } else if (pvd->vdev_ops != &vdev_spare_ops) {
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- }
- }
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
spa_version(spa) >= SPA_VERSION_SPARES);
@@ -3293,16 +4102,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* check to see if we changed the original vdev's path to have "/old"
* at the end in spa_vdev_attach(). If so, undo that change now.
*/
- if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
- pvd->vdev_child[0]->vdev_path != NULL &&
- pvd->vdev_child[1]->vdev_path != NULL) {
- ASSERT(pvd->vdev_child[1] == vd);
- cvd = pvd->vdev_child[0];
- len = strlen(vd->vdev_path);
- if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
- strcmp(cvd->vdev_path + len, "/old") == 0) {
- spa_strfree(cvd->vdev_path);
- cvd->vdev_path = spa_strdup(vd->vdev_path);
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
}
}
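
The generalized loop above drops the old two-child assumption and instead scans every sibling for a path equal to the detached vdev's path with "/old" appended. The string test in isolation, as a standalone sketch:

    #include <stdio.h>
    #include <string.h>

    /* Does 'sibling' look like 'path' with "/old" appended? */
    static int
    is_old_twin(const char *sibling, const char *path)
    {
        size_t len = strlen(path);

        return (strncmp(sibling, path, len) == 0 &&
            strcmp(sibling + len, "/old") == 0);
    }

    int
    main(void)
    {
        /* The sibling would be renamed back to /dev/da0 here. */
        printf("%d\n", is_old_twin("/dev/da0/old", "/dev/da0"));
        return (0);
    }
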
@@ -3312,7 +4127,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* active spare list for the pool.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+ vd->vdev_id == 0 &&
+ pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
/*
@@ -3334,7 +4150,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
/*
* Remember one of the remaining children so we can get tvd below.
*/
- cvd = pvd->vdev_child[0];
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
/*
* If we need to remove the remaining child from the list of hot spares,
@@ -3350,14 +4166,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid;
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
}
/*
* If the parent mirror/replacing vdev only has one child,
* the parent is no longer needed. Remove it from the tree.
*/
- if (pvd->vdev_children == 1)
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
+ cvd->vdev_resilvering = B_FALSE;
+ }
+
/*
* We don't set tvd until now because the parent we just removed
@@ -3372,12 +4194,16 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_propagate_state(cvd);
/*
- * If the device we just detached was smaller than the others, it may be
- * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
- * can't fail because the existing metaslabs are already in core, so
- * there's nothing to read from disk.
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
*/
- VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
vdev_config_dirty(tvd);
@@ -3387,6 +4213,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* But first make sure we're not on any *other* txg's DTL list, to
* prevent vd from being accessed after it's freed.
*/
+ vdpath = spa_strdup(vd->vdev_path);
for (int t = 0; t < TXG_SIZE; t++)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
vd->vdev_detached = B_TRUE;
@@ -3394,31 +4221,335 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
error = spa_vdev_exit(spa, vd, txg, 0);
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
/*
* If this was the removal of the original device in a hot spare vdev,
* then we want to go through and remove the device from the hot spare
* list of every other pool.
*/
if (unspare) {
- spa_t *myspa = spa;
- spa = NULL;
+ spa_t *altspa = NULL;
+
mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
- if (spa == myspa)
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
continue;
- spa_open_ref(spa, FTAG);
+
+ spa_open_ref(altspa, FTAG);
mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
+ spa_close(altspa, FTAG);
}
mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
}
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+{
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_offline_log(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || vd->vdev_ishole) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = EINVAL;
+ break;
+ }
+ }
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = EINVAL;
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = ENODEV;
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ vml[c]->vdev_ishole ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = EINVAL;
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c])) {
+ error = EBUSY;
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ /*
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
+ */
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
+
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+#ifndef sun
+ /* mark that we are creating a new spa by splitting */
+ newspa->spa_splitting_newspa = B_TRUE;
+#endif
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+#ifndef sun
+ newspa->spa_splitting_newspa = B_FALSE;
+#endif
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
+ }
+
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
+
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
+
+ spa_async_resume(newspa);
+
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+ spa, tx, "vdev=%s",
+ vml[c]->vdev_path);
+ vdev_free(vml[c]);
+ }
+ }
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+ "split new pool %s from pool %s", newname, spa_name(spa));
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ kmem_free(vml, children * sizeof (vdev_t *));
return (error);
}
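
The child-count check near the top of spa_vdev_split_mirror() deserves a gloss: log and hole vdevs are only tolerated as a trailing run, and the split request must name exactly the leading non-log children. Below is a standalone illustrative sketch of that rule, not code from this change; the helper name and the boolean-array representation are assumptions.

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Model of the "lastlog" rule: lastlog ends up as the index where a
     * trailing run of log/hole children begins (0 if there is none), and
     * the request must cover exactly the children before that run.
     */
    static bool
    split_child_count_ok(const bool *is_log_or_hole, uint32_t nchildren,
        uint32_t requested)
    {
            uint32_t lastlog = 0;

            for (uint32_t c = 0; c < nchildren; c++) {
                    if (is_log_or_hole[c]) {
                            if (lastlog == 0)
                                    lastlog = c;    /* run may be trailing */
                            continue;
                    }
                    lastlog = 0;    /* data child follows: run wasn't trailing */
            }
            return (requested == (lastlog != 0 ? lastlog : nchildren));
    }

For children [data, data, log, log] only requested == 2 passes; for [data, log, data] the reset leaves lastlog == 0, so only requested == 3 passes, and the interior log is then rejected by the per-child checks later in the function.
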
@@ -3464,19 +4595,118 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
+ * Evacuate the device.
+ */
+static int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+ uint64_t txg;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer,
+ * since we need to do I/O, but we do keep the spa_namespace_lock
+ * held. Once this completes, the device should no longer have any
+ * blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_offline_log(spa);
+ } else {
+ error = ENOTSUP;
+ }
+
+ if (error)
+ return (error);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+ txg = spa_vdev_config_enter(spa);
+ vd->vdev_removing = B_TRUE;
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_config_dirty(vd);
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+static void
+spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t id = vd->vdev_id;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Only remove any devices which are empty.
+ */
+ if (vd->vdev_stat.vs_alloc != 0)
+ return;
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ vdev_free(vd);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result, we use
+ * the spa_vdev_config_[enter/exit] functions, which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
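
Concretely, the pattern described above brackets the whole operation with spa_vdev_enter()/spa_vdev_exit() and syncs each intermediate step through a spa_vdev_config_exit()/spa_vdev_config_enter() pair, never dropping the namespace lock in between. A schematic fragment of that shape, using the functions this change introduces (steps elided; not compilable on its own):

    uint64_t txg = spa_vdev_enter(spa);            /* namespace lock + SCL_ALL */

    /* step 1: record intent, dirty the config ... */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); /* syncs txg, keeps namespace lock */

    txg = spa_vdev_config_enter(spa);              /* re-take SCL_ALL for the next step */
    /* step 2: clean up the namespace ... */
    return (spa_vdev_exit(spa, NULL, txg, 0));     /* final sync, config cache update */
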
+/*
* Remove a device from the pool. Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
+ metaslab_group_t *mg;
nvlist_t **spares, **l2cache, *nv;
- uint_t nspares, nl2cache;
uint64_t txg = 0;
+ uint_t nspares, nl2cache;
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ ASSERT(spa_writeable(spa));
+
if (!locked)
txg = spa_vdev_enter(spa);
@@ -3509,6 +4739,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * XXX - Once we have bp-rewrite this should
+ * become the common case.
+ */
+
+ mg = vd->vdev_mg;
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Attempt to evacuate the vdev.
+ */
+ error = spa_vdev_remove_evacuate(spa, vd);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /*
+ * If we couldn't evacuate the vdev, unwind.
+ */
+ if (error) {
+ metaslab_group_activate(mg);
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ spa_vdev_remove_from_namespace(spa, vd);
+
} else if (vd != NULL) {
/*
* Normal vdevs cannot be removed (yet).
@@ -3535,22 +4808,29 @@ static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
vdev_t *newvd, *oldvd;
- int c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
if (oldvd != NULL)
return (oldvd);
}
/*
- * Check for a completed replacement.
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK, as it will require
+ * user intervention to determine which disk the admin wishes to keep.
*/
- if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
oldvd = vd->vdev_child[0];
- newvd = vd->vdev_child[1];
if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
!vdev_dtl_required(oldvd))
return (oldvd);
}
@@ -3558,15 +4838,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
/*
* Check for a completed resilver with the 'unspare' flag set.
*/
- if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
- newvd = vd->vdev_child[0];
- oldvd = vd->vdev_child[1];
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
- if (newvd->vdev_unspare &&
+ if (oldvd != NULL &&
vdev_dtl_empty(newvd, DTL_MISSING) &&
- !vdev_dtl_required(oldvd)) {
- newvd->vdev_unspare = 0;
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
return (oldvd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
}
}
@@ -3593,9 +4899,9 @@ spa_vdev_resilver_done(spa_t *spa)
* we need to detach the parent's first child (the original hot
* spare) as well.
*/
- if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(ppvd->vdev_children == 2);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -3610,36 +4916,43 @@ spa_vdev_resilver_done(spa_t *spa)
}
/*
- * Update the stored path or FRU for this vdev. Dirty the vdev configuration,
- * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
+ * Update the stored path or FRU for this vdev.
*/
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
boolean_t ispath)
{
vdev_t *vd;
- uint64_t txg;
+ boolean_t sync = B_FALSE;
- txg = spa_vdev_enter(spa);
+ ASSERT(spa_writeable(spa));
+
+ spa_vdev_state_enter(spa, SCL_ALL);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ return (spa_vdev_state_exit(spa, NULL, ENOENT));
if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
if (ispath) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(value);
+ if (strcmp(value, vd->vdev_path) != 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ sync = B_TRUE;
+ }
} else {
- if (vd->vdev_fru != NULL)
+ if (vd->vdev_fru == NULL) {
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ } else if (strcmp(value, vd->vdev_fru) != 0) {
spa_strfree(vd->vdev_fru);
- vd->vdev_fru = spa_strdup(value);
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ }
}
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
+ return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}
int
@@ -3656,40 +4969,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
/*
* ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
* ==========================================================================
*/
int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (EBUSY);
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
- if ((uint_t)type >= POOL_SCRUB_TYPES)
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
return (ENOTSUP);
/*
* If a resilver was requested, but there is no DTL on a
* writeable leaf device, we have nothing to do.
*/
- if (type == POOL_SCRUB_RESILVER &&
+ if (func == POOL_SCAN_RESILVER &&
!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
return (0);
}
- if (type == POOL_SCRUB_EVERYTHING &&
- spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
- spa->spa_dsl_pool->dp_scrub_isresilver)
- return (EBUSY);
-
- if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
- return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
- } else if (type == POOL_SCRUB_NONE) {
- return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
- } else {
- return (EINVAL);
- }
+ return (dsl_scan(spa->spa_dsl_pool, func));
}
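
A caller-side fragment for the new scan entry points, offered as a hedged sketch rather than code from this change. Per the bodies above, a resilver request with no DTL work on any writeable leaf short-circuits to success, and cancellation is refused mid-resilver:

    int err = spa_scan(spa, POOL_SCAN_SCRUB);  /* ENOTSUP for unknown funcs */

    /* some time later */
    err = spa_scan_stop(spa);                  /* EBUSY while resilvering */
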
/*
@@ -3702,7 +5013,8 @@ static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_remove_wanted) {
- vd->vdev_remove_wanted = 0;
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
/*
@@ -3726,7 +5038,7 @@ static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = 0;
+ vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
}
@@ -3735,6 +5047,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd)
}
static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath;
+
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+}
+
+static void
spa_async_thread(void *arg)
{
spa_t *spa = arg;
@@ -3751,16 +5094,31 @@ spa_async_thread(void *arg)
* See if the config needs to be updated.
*/
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t old_space, new_space;
+
mutex_enter(&spa_namespace_lock);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (new_space != old_space) {
+ spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
+ spa, NULL,
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), new_space, new_space - old_space);
+ }
}
/*
* See if any devices need to be marked REMOVED.
*/
if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_remove(spa, spa->spa_root_vdev);
for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
@@ -3769,11 +5127,17 @@ spa_async_thread(void *arg)
(void) spa_vdev_state_exit(spa, NULL, 0);
}
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
/*
* See if any devices need to be probed.
*/
if (tasks & SPA_ASYNC_PROBE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -3788,7 +5152,7 @@ spa_async_thread(void *arg)
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
/*
* Let the world know that we're done.
@@ -3834,6 +5198,7 @@ spa_async_dispatch(spa_t *spa)
void
spa_async_request(spa_t *spa, int task)
{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks |= task;
mutex_exit(&spa->spa_async_lock);
@@ -3845,37 +5210,22 @@ spa_async_request(spa_t *spa, int task)
* ==========================================================================
*/
-static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
- bplist_t *bpl = &spa->spa_sync_bplist;
- dmu_tx_t *tx;
- blkptr_t blk;
- uint64_t itor = 0;
- zio_t *zio;
- int error;
- uint8_t c = 1;
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- ASSERT(blk.blk_birth < txg);
- zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
- }
-
- error = zio_wait(zio);
- ASSERT3U(error, ==, 0);
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, tx);
+ return (0);
+}
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- bplist_vacate(bpl, tx);
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *zio = arg;
- /*
- * Pre-dirty the first block so we sync to convergence faster.
- * (Usually only the first block is needed.)
- */
- dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
- dmu_tx_commit(tx);
+ zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+ zio->io_flags));
+ return (0);
}
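
These two callbacks are consumed from spa_sync() later in this diff: early sync passes walk the per-txg free bplist with spa_free_sync_cb under a root zio, while later passes hand the remainder to bpobj_enqueue_cb for deferral. The fan-out/fan-in idiom they plug into, as it appears in the spa_sync() hunk below:

    zio_t *zio = zio_root(spa, NULL, NULL, 0);           /* parent collects children */
    bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx); /* zio_nowait() per block */
    VERIFY(zio_wait(zio) == 0);                          /* wait for every free */
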
static void
@@ -3942,7 +5292,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
- B_FALSE, B_FALSE, B_TRUE);
+ B_FALSE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
sav->sav_count) == 0);
for (i = 0; i < sav->sav_count; i++)
@@ -3982,7 +5332,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
* Set zpool properties.
*/
static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
objset_t *mos = spa->spa_meta_objset;
@@ -4023,9 +5373,11 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT(spa->spa_root != NULL);
break;
+ case ZPOOL_PROP_READONLY:
case ZPOOL_PROP_CACHEFILE:
/*
- * 'cachefile' is also a non-persisitent property.
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
*/
break;
default:
@@ -4033,8 +5385,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* Set pool property values in the poolprops mos object.
*/
if (spa->spa_pool_props_object == 0) {
- objset_t *mos = spa->spa_meta_objset;
-
VERIFY((spa->spa_pool_props_object =
zap_create(mos, DMU_OT_POOL_PROPS,
DMU_OT_NONE, 0, tx)) > 0);
@@ -4081,6 +5431,15 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
case ZPOOL_PROP_FAILUREMODE:
spa->spa_failmode = intval;
break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_DEDUPDITTO:
+ spa->spa_dedup_ditto = intval;
+ break;
default:
break;
}
@@ -4089,8 +5448,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* log internal history if this is not a zpool create */
if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
tx->tx_txg != TXG_INITIAL) {
- spa_history_internal_log(LOG_POOL_PROPSET,
- spa, tx, cr, "%s %lld %s",
+ spa_history_log_internal(LOG_POOL_PROPSET,
+ spa, tx, "%s %lld %s",
nvpair_name(elem), intval, spa_name(spa));
}
}
@@ -4099,6 +5458,42 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ ASSERT(spa->spa_sync_pass == 1);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+}
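
Each branch above is the same one-shot gate: the action fires exactly in the txg whose sync carries the on-disk version across a feature threshold, and never again. A standalone model of the gate; the helper name and signature are illustrative, not part of ZFS:

    #include <stdint.h>

    /*
     * Run 'action' exactly once: in the txg whose sync takes the pool
     * from a version below 'vers' (last synced uberblock) to one at or
     * above it (the uberblock now syncing).
     */
    static void
    upgrade_if_crossing(uint64_t synced_vers, uint64_t syncing_vers,
        uint64_t vers, void (*action)(void))
    {
            if (synced_vers < vers && syncing_vers >= vers)
                    action();
    }
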
+
+/*
* Sync the specified transaction group. New blocks may be dirtied as
* part of the process, so we iterate until it converges.
*/
@@ -4107,13 +5502,15 @@ spa_sync(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bplist_t *bpl = &spa->spa_sync_bplist;
+ bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
- int dirty_vdevs;
int error;
+ VERIFY(spa_writeable(spa));
+
/*
* Lock out configuration changes.
*/
@@ -4146,8 +5543,6 @@ spa_sync(spa_t *spa, uint64_t txg)
}
spa_config_exit(spa, SCL_STATE, FTAG);
- VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
-
tx = dmu_tx_create_assigned(dp, txg);
/*
@@ -4171,34 +5566,29 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}
- if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
- dsl_pool_create_origin(dp, tx);
-
- /* Keeping the origin open increases spa_minref */
- spa->spa_minref += 3;
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
- dsl_pool_upgrade_clones(dp, tx);
- }
-
/*
- * If anything has changed in this txg, push the deferred frees
- * from the previous txg. If not, leave them alone so that we
- * don't generate work on an otherwise idle system.
+ * If anything has changed in this txg, or if someone is waiting
+ * for this txg to sync (e.g., spa_vdev_remove()), push the
+ * deferred frees from the previous txg. If not, leave them
+ * alone so that we don't generate work on an otherwise idle
+ * system.
*/
if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
!txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg))
- spa_sync_deferred_frees(spa, txg);
+ !txg_list_empty(&dp->dp_sync_tasks, txg) ||
+ ((dsl_scan_active(dp->dp_scan) ||
+ txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(defer_bpo,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY3U(zio_wait(zio), ==, 0);
+ }
/*
* Iterate to convergence.
*/
do {
- spa->spa_sync_pass++;
+ int pass = ++spa->spa_sync_pass;
spa_sync_config_object(spa, tx);
spa_sync_aux_dev(spa, &spa->spa_spares, tx,
@@ -4208,18 +5598,26 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_errlog_sync(spa, txg);
dsl_pool_sync(dp, txg);
- dirty_vdevs = 0;
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
- vdev_sync(vd, txg);
- dirty_vdevs++;
+ if (pass <= SYNC_PASS_DEFERRED_FREE) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(free_bpl, spa_free_sync_cb,
+ zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+ } else {
+ bplist_iterate(free_bpl, bpobj_enqueue_cb,
+ defer_bpo, tx);
}
- bplist_sync(bpl, tx);
- } while (dirty_vdevs);
+ ddt_sync(spa, txg);
+ dsl_scan_sync(dp, tx);
+
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ vdev_sync(vd, txg);
- bplist_close(bpl);
+ if (pass == 1)
+ spa_sync_upgrades(spa, tx);
- dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+ } while (dmu_objset_is_dirty(mos, txg));
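
The convergence loop now splits frees by pass number: early passes issue frees immediately, which dirties space maps and may force another pass, while passes beyond SYNC_PASS_DEFERRED_FREE push what remains onto the deferred bpobj (drained at the top of the next txg's sync), so each extra pass generates less new dirty data and the while (dmu_objset_is_dirty(...)) condition eventually clears. A minimal standalone model of the policy; the threshold value is assumed for illustration:

    #include <stdbool.h>

    enum { MODEL_SYNC_PASS_DEFERRED_FREE = 2 };    /* assumed threshold */

    /* Free now in early passes; defer later so the sync loop converges. */
    static bool
    free_in_this_pass(int pass)
    {
            return (pass <= MODEL_SYNC_PASS_DEFERRED_FREE);
    }
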
/*
* Rewrite the vdev configuration (which includes the uberblock)
@@ -4242,9 +5640,8 @@ spa_sync(spa_t *spa, uint64_t txg)
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
- int c;
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
if (vd->vdev_ms_array == 0 || vd->vdev_islog)
continue;
@@ -4291,10 +5688,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_ubsync = spa->spa_uberblock;
- /*
- * Clean up the ZIL records for the synced txg.
- */
- dsl_pool_zil_clean(dp);
+ dsl_pool_sync_done(dp, txg);
/*
* Update usable space statistics.
@@ -4302,6 +5696,8 @@ spa_sync(spa_t *spa, uint64_t txg)
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
vdev_sync_done(vd, txg);
+ spa_update_dspace(spa);
+
/*
* It had better be the case that we didn't dirty anything
* since vdev_config_sync().
@@ -4309,10 +5705,13 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(bpl->bpl_queue == NULL);
+
+ spa->spa_sync_pass = 0;
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_handle_ignored_writes(spa);
+
/*
* If any async tasks have been requested, kick them off.
*/
@@ -4330,7 +5729,8 @@ spa_sync_allpools(void)
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -4410,6 +5810,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
void
spa_upgrade(spa_t *spa, uint64_t version)
{
+ ASSERT(spa_writeable(spa));
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -4479,7 +5881,6 @@ spa_has_active_shared_spare(spa_t *spa)
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
-#if 0
#ifdef _KERNEL
sysevent_t *ev;
sysevent_attr_list_t *attr = NULL;
@@ -4526,5 +5927,4 @@ done:
sysevent_free_attr(attr);
sysevent_free(ev);
#endif
-#endif
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
index 34050ef9150a..0b8255ef3558 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -36,6 +35,7 @@
#include <sys/sunddi.h>
#ifdef _KERNEL
#include <sys/kobj.h>
+#include <sys/zone.h>
#endif
/*
@@ -74,7 +74,6 @@ spa_config_load(void)
void *buf = NULL;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
- spa_t *spa;
char *pathname;
struct _buf *file;
uint64_t fsize;
@@ -88,25 +87,21 @@ spa_config_load(void)
file = kobj_open_file(pathname);
- if (file == (struct _buf *)-1) {
- ZFS_LOG(1, "Cannot open %s.", pathname);
- goto out;
- }
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (file == (struct _buf *)-1)
+ return;
- if (kobj_get_filesize(file, &fsize) != 0) {
- ZFS_LOG(1, "Cannot get size of %s.", pathname);
+ if (kobj_get_filesize(file, &fsize) != 0)
goto out;
- }
buf = kmem_alloc(fsize, KM_SLEEP);
/*
* Read the nvlist from the file.
*/
- if (kobj_read_file(file, buf, fsize, 0) < 0) {
- ZFS_LOG(1, "Cannot read %s.", pathname);
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
goto out;
- }
/*
* Unpack the nvlist.
@@ -114,8 +109,6 @@ spa_config_load(void)
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
goto out;
- ZFS_LOG(1, "File %s loaded.", pathname);
-
/*
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
@@ -123,7 +116,6 @@ spa_config_load(void)
mutex_enter(&spa_namespace_lock);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
continue;
@@ -131,33 +123,27 @@ spa_config_load(void)
if (spa_lookup(nvpair_name(nvpair)) != NULL)
continue;
- spa = spa_add(nvpair_name(nvpair), NULL);
-
- /*
- * We blindly duplicate the configuration here. If it's
- * invalid, we will catch it when the pool is first opened.
- */
- VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ (void) spa_add(nvpair_name(nvpair), child, NULL);
}
mutex_exit(&spa_namespace_lock);
nvlist_free(nvlist);
out:
- kmem_free(pathname, MAXPATHLEN);
if (buf != NULL)
kmem_free(buf, fsize);
- if (file != (struct _buf *)-1)
- kobj_close_file(file);
+
+ kobj_close_file(file);
}
static void
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
- int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
- char *buf, *temp;
size_t buflen;
+ char *buf;
vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char *temp;
/*
* If the nvlist is empty (NULL), then remove the old cachefile.
@@ -328,6 +314,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vdev_t *rvd = spa->spa_root_vdev;
unsigned long hostid = 0;
boolean_t locked = B_FALSE;
+ uint64_t split_guid;
if (vd == NULL) {
vd = rvd;
@@ -356,7 +343,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
txg) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
spa_guid(spa)) == 0);
+#ifdef _KERNEL
+ hostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so we can't use
+ * zone_get_hostid().
+ */
(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
+#endif /* _KERNEL */
if (hostid != 0) {
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
hostid) == 0);
@@ -376,12 +371,63 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
1ULL) == 0);
vd = vd->vdev_top; /* label contains top config */
+ } else {
+ /*
+ * Only add the (potentially large) split information
+ * in the MOS config, and not in the vdev labels.
+ */
+ if (spa->spa_config_splitting != NULL)
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ spa->spa_config_splitting) == 0);
}
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace.
+ */
+ vdev_top_config_generate(spa, config);
+
+ /*
+ * If we're splitting, record the original pool's guid.
+ */
+ if (spa->spa_config_splitting != NULL &&
+ nvlist_lookup_uint64(spa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+ split_guid) == 0);
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, 0);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
+ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+
+ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
+ kmem_free(ddh, sizeof (ddt_histogram_t));
+
+ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+ ddt_get_dedup_object_stats(spa, ddo);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
+ kmem_free(ddo, sizeof (ddt_object_t));
+
+ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+ ddt_get_dedup_stats(spa, dds);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
+ kmem_free(dds, sizeof (ddt_stat_t));
+ }
+
if (locked)
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
@@ -416,10 +462,9 @@ spa_config_update(spa_t *spa, int what)
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ms_array == 0) {
- vdev_init(tvd, txg);
- vdev_config_dirty(tvd);
- }
+ if (tvd->vdev_ms_array == 0)
+ vdev_metaslab_set_size(tvd);
+ vdev_expand(tvd, txg);
}
}
spa_config_exit(spa, SCL_ALL, FTAG);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
index e1ae4917137a..282140b3bd65 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -54,38 +53,6 @@
#include <sys/zap.h>
#include <sys/zio.h>
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-#ifdef _KERNEL
-uint64_t
-_strtonum(const char *str, char **nptr)
-{
- uint64_t val = 0;
- char c;
- int digit;
-
- while ((c = *str) != '\0') {
- if (c >= '0' && c <= '9')
- digit = c - '0';
- else if (c >= 'a' && c <= 'f')
- digit = 10 + c - 'a';
- else
- break;
-
- val *= 16;
- val += digit;
-
- str++;
- }
-
- if (nptr)
- *nptr = (char *)str;
-
- return (val);
-}
-#endif
/*
* Convert a bookmark to a string.
@@ -105,13 +72,13 @@ bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
static void
name_to_bookmark(char *buf, zbookmark_t *zb)
{
- zb->zb_objset = _strtonum(buf, &buf);
+ zb->zb_objset = strtonum(buf, &buf);
ASSERT(*buf == ':');
- zb->zb_object = _strtonum(buf + 1, &buf);
+ zb->zb_object = strtonum(buf + 1, &buf);
ASSERT(*buf == ':');
- zb->zb_level = (int)_strtonum(buf + 1, &buf);
+ zb->zb_level = (int)strtonum(buf + 1, &buf);
ASSERT(*buf == ':');
- zb->zb_blkid = _strtonum(buf + 1, &buf);
+ zb->zb_blkid = strtonum(buf + 1, &buf);
ASSERT(*buf == '\0');
}
#endif
@@ -134,7 +101,7 @@ spa_log_error(spa_t *spa, zio_t *zio)
* If we are trying to import a pool, ignore any errors, as we won't be
* writing to the pool any time soon.
*/
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
return;
mutex_enter(&spa->spa_errlist_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
index b403ccbcc444..942636b906ce 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/spa.h>
@@ -32,6 +31,7 @@
#include <sys/dmu_objset.h>
#include <sys/utsname.h>
#include <sys/sunddi.h>
+#include "zfs_comutil.h"
#ifdef _KERNEL
#include <sys/cmn_err.h>
#include <sys/zone.h>
@@ -103,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
* Figure out maximum size of history log. We set it at
* 1% of pool size, with a max of 32MB and min of 128KB.
*/
- shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
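
The sizing above is a plain 1%-with-bounds clamp on the normal class's deflated space. A standalone sketch with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical standalone form of the history-log size clamp. */
    static uint64_t
    history_max_off(uint64_t dspace)
    {
            uint64_t off = dspace / 100;    /* 1% of pool space */

            if (off > (32ULL << 20))
                    off = 32ULL << 20;      /* cap: 32 MB */
            if (off < (128ULL << 10))
                    off = 128ULL << 10;     /* floor: 128 KB */
            return (off);
    }

A 1 TB pool yields about 10 GB at 1% and is capped at 32 MB; a 1 GB pool gets about 10 MB; only pools under roughly 12.8 MB hit the 128 KB floor.
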
@@ -187,8 +188,9 @@ spa_history_zone()
/*
* Write out a history event.
*/
+/*ARGSUSED*/
static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
history_arg_t *hap = arg2;
@@ -231,9 +233,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
gethrestime_sec()) == 0);
- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO,
- (uint64_t)crgetuid(cr)) == 0);
- if (hap->ha_zone[0] != '\0')
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
+ if (hap->ha_zone != NULL)
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
hap->ha_zone) == 0);
#ifdef _KERNEL
@@ -244,6 +245,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
hap->ha_log_type == LOG_CMD_NORMAL) {
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
history_str) == 0);
+
+ zfs_dbgmsg("command: %s", history_str);
} else {
VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
hap->ha_event) == 0);
@@ -251,6 +254,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
tx->tx_txg) == 0);
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
history_str) == 0);
+
+ zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
+ zfs_history_event_names[hap->ha_event], spa_name(spa),
+ (longlong_t)tx->tx_txg, history_str);
+
}
VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
@@ -279,10 +287,10 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
kmem_free(record_packed, reclen);
dmu_buf_rele(dbp, FTAG);
- if (hap->ha_log_type == LOG_INTERNAL) {
- kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN);
- kmem_free(hap, sizeof (history_arg_t));
- }
+ strfree(hap->ha_history_str);
+ if (hap->ha_zone != NULL)
+ strfree(hap->ha_zone);
+ kmem_free(hap, sizeof (history_arg_t));
}
/*
@@ -291,15 +299,32 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
int
spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
{
- history_arg_t ha;
+ history_arg_t *ha;
+ int err = 0;
+ dmu_tx_t *tx;
ASSERT(what != LOG_INTERNAL);
- ha.ha_history_str = history_str;
- ha.ha_log_type = what;
- (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone));
- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
- spa, &ha, 0));
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+ ha->ha_history_str = strdup(history_str);
+ ha->ha_zone = strdup(spa_history_zone());
+ ha->ha_log_type = what;
+ ha->ha_uid = crgetuid(CRED());
+
+ /* Kick this off asynchronously; errors are ignored. */
+ dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
+ spa_history_log_sync, spa, ha, 0, tx);
+ dmu_tx_commit(tx);
+
+ /* spa_history_log_sync will free ha and strings */
+ return (err);
}
/*
@@ -322,6 +347,14 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
if (!spa->spa_history)
return (ENOENT);
+ /*
+ * The history is logged asynchronously, so when a consumer requests
+ * the first chunk of history, make sure everything has been
+ * synced to disk so that we get it.
+ */
+ if (*offp == 0 && spa_writeable(spa))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
return (err);
shpp = dbp->db_data;
@@ -391,13 +424,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
return (err);
}
-void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+static void
+log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, va_list adx)
{
- history_arg_t *hap;
- char *str;
- va_list adx;
+ history_arg_t *ha;
+ va_list adx2;
/*
* If this is part of creating a pool, not everything is
@@ -406,23 +438,71 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
if (tx->tx_txg == TXG_INITIAL)
return;
- hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
- str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+ va_copy(adx2, adx);
- va_start(adx, fmt);
- (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
- va_end(adx);
+ ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+ ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx2) + 1,
+ KM_SLEEP);
+
+ va_end(adx2);
+
+ (void) vsprintf(ha->ha_history_str, fmt, adx);
- hap->ha_log_type = LOG_INTERNAL;
- hap->ha_history_str = str;
- hap->ha_event = event;
- hap->ha_zone[0] = '\0';
+ ha->ha_log_type = LOG_INTERNAL;
+ ha->ha_event = event;
+ ha->ha_zone = NULL;
+ ha->ha_uid = 0;
if (dmu_tx_is_syncing(tx)) {
- spa_history_log_sync(spa, hap, cr, tx);
+ spa_history_log_sync(spa, ha, tx);
} else {
dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
- spa_history_log_sync, spa, hap, 0, tx);
+ spa_history_log_sync, spa, ha, 0, tx);
}
- /* spa_history_log_sync() will free hap and str */
+ /* spa_history_log_sync() will free ha and strings */
+}
+
+void
+spa_history_log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(event, spa, htx, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_version(spa_t *spa, history_internal_events_t event)
+{
+#ifdef _KERNEL
+ uint64_t current_vers = spa_version(spa);
+
+ if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
+ spa_history_log_internal(event, spa, NULL,
+ "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
+ (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
+ utsname.nodename, utsname.release, utsname.version,
+ utsname.machine);
+ }
+ cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
+ event == LOG_POOL_IMPORT ? "imported" :
+ event == LOG_POOL_CREATE ? "created" : "accessed",
+ (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
+#endif
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 89e0301873cf..1709f6884c9d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,10 +40,11 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
-#include <sys/sunddi.h>
#include <sys/arc.h>
+#include <sys/ddt.h>
#include "zfs_prop.h"
/*
@@ -186,7 +186,7 @@
*
* SCL_VDEV
* Held as reader to prevent changes to the vdev tree during trivial
- * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
* other locks, and lower than all of them, to ensure that it's safe
* to acquire regardless of caller context.
*
@@ -314,8 +314,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
+ int wlocks_held = 0;
+
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
@@ -335,6 +339,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
(void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
+ ASSERT(wlocks_held <= locks);
}
void
@@ -419,7 +424,7 @@ spa_lookup(const char *name)
* exist by calling spa_lookup() first.
*/
spa_t *
-spa_add(const char *name, const char *altroot)
+spa_add(const char *name, nvlist_t *config, const char *altroot)
{
spa_t *spa;
spa_config_dirent_t *dp;
@@ -429,29 +434,36 @@ spa_add(const char *name, const char *altroot)
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
spa->spa_freeze_txg = UINT64_MAX;
spa->spa_final_txg = UINT64_MAX;
+ spa->spa_load_max_txg = UINT64_MAX;
+ spa->spa_proc = &p0;
+ spa->spa_proc_state = SPA_PROC_NONE;
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
avl_add(&spa_namespace_avl, spa);
- mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
-
/*
* Set the alternate root, if there is one.
*/
@@ -467,9 +479,15 @@ spa_add(const char *name, const char *altroot)
offsetof(spa_config_dirent_t, scd_link));
dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
- dp->scd_path = spa_strdup(spa_config_path);
+ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
list_insert_head(&spa->spa_config_list, dp);
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
+ if (config != NULL)
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
return (spa);
}
@@ -486,6 +504,8 @@ spa_remove(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ nvlist_free(spa->spa_config_splitting);
+
avl_remove(&spa_namespace_avl, spa);
cv_broadcast(&spa_namespace_cv);
@@ -503,24 +523,30 @@ spa_remove(spa_t *spa)
list_destroy(&spa->spa_config_list);
+ nvlist_free(spa->spa_load_info);
spa_config_set(spa, NULL);
refcount_destroy(&spa->spa_refcount);
spa_config_lock_destroy(spa);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_proc_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_scrub_lock);
- mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_proc_lock);
mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
+ mutex_destroy(&spa->spa_vdev_top_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -814,12 +840,6 @@ spa_l2cache_activate(vdev_t *vd)
mutex_exit(&spa_l2cache_lock);
}
-void
-spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
-{
- vdev_space_update(vd, space, alloc, B_FALSE);
-}
-
/*
* ==========================================================================
* SPA vdev locking
@@ -834,7 +854,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
uint64_t
spa_vdev_enter(spa_t *spa)
{
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (e.g. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
@@ -842,14 +875,14 @@ spa_vdev_enter(spa_t *spa)
}
/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
*/
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
@@ -861,17 +894,28 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
- /*
- * If the config changed, notify the scrub thread that it must restart.
- */
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
- dsl_pool_scrub_restart(spa->spa_dsl_pool);
config_changed = B_TRUE;
+ spa->spa_config_generation++;
}
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+
spa_config_exit(spa, SCL_ALL, spa);
/*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag, 0);
+
+ /*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
@@ -891,8 +935,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
if (config_changed)
spa_config_sync(spa, B_FALSE, B_TRUE);
+}
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
@@ -901,18 +957,52 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
* Lock the given spa_t for the purpose of changing vdev state.
*/
void
-spa_vdev_state_enter(spa_t *spa)
+spa_vdev_state_enter(spa_t *spa, int oplocks)
{
- spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+ int locks = SCL_STATE_ALL | oplocks;
+
+ /*
+ * Root pools may need to read from the underlying devfs filesystem
+ * when opening up a vdev. Unfortunately if we're holding the
+ * SCL_ZIO lock it will result in a deadlock when we try to issue
+ * the read from the root filesystem. Instead we "prefetch"
+ * the associated vnodes that we need prior to opening the
+ * underlying devices and cache them so that we can prevent
+ * any I/O when we are doing the actual open.
+ */
+ if (spa_is_root(spa)) {
+ int low = locks & ~(SCL_ZIO - 1);
+ int high = locks & ~low;
+
+ spa_config_enter(spa, high, spa, RW_WRITER);
+ vdev_hold(spa->spa_root_vdev);
+ spa_config_enter(spa, low, spa, RW_WRITER);
+ } else {
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ }
+ spa->spa_vdev_locks = locks;
}
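
The low/high split above is bit arithmetic over one-hot lock flags: SCL_CONFIG, the highest-ranked lock, occupies bit 0 and rank descends as bit position ascends, so locks & ~(SCL_ZIO - 1) keeps SCL_ZIO and every lower-ranked lock. A standalone check of the split; the bit assignments here are assumed for illustration:

    #include <assert.h>

    enum {  /* assumed one-hot ordering: highest-ranked lock in bit 0 */
            M_SCL_CONFIG = 1 << 0,
            M_SCL_STATE  = 1 << 1,
            M_SCL_L2ARC  = 1 << 2,
            M_SCL_ALLOC  = 1 << 3,
            M_SCL_ZIO    = 1 << 4,
            M_SCL_FREE   = 1 << 5,
            M_SCL_VDEV   = 1 << 6,
    };

    int
    main(void)
    {
            int locks = M_SCL_CONFIG | M_SCL_STATE | M_SCL_ZIO | M_SCL_VDEV;
            int low = locks & ~(M_SCL_ZIO - 1);     /* SCL_ZIO and below */
            int high = locks & ~low;                /* ranks above SCL_ZIO */

            assert(low == (M_SCL_ZIO | M_SCL_VDEV));
            assert(high == (M_SCL_CONFIG | M_SCL_STATE));
            return (0);
    }
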
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
- if (vd != NULL)
+ boolean_t config_changed = B_FALSE;
+
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+ 0, 0, B_FALSE);
+
+ if (vd != NULL) {
vdev_state_dirty(vd->vdev_top);
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ if (spa_is_root(spa))
+ vdev_rele(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_STATE_ALL, spa);
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
/*
* If anything changed, wait for it to sync. This ensures that,
@@ -923,6 +1013,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
if (vd != NULL)
txg_wait_synced(spa->spa_dsl_pool, 0);
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+ mutex_exit(&spa_namespace_lock);
+ }
+
return (error);
}
@@ -982,14 +1081,13 @@ spa_rename(const char *name, const char *newname)
return (0);
}
-
/*
- * Determine whether a pool with given pool_guid exists. If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
+ * Return the spa_t associated with the given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
*/
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
@@ -1020,7 +1118,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
}
}
- return (spa != NULL);
+ return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists. If device_guid
+ * is non-zero, the pool must also contain a device with that guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
}
char *
@@ -1055,48 +1162,36 @@ spa_get_random(uint64_t range)
return (r % range);
}
-void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+uint64_t
+spa_generate_guid(spa_t *spa)
{
- int d;
+ uint64_t guid = spa_get_random(-1ULL);
- if (bp == NULL) {
- (void) snprintf(buf, len, "<NULL>");
- return;
+ if (spa != NULL) {
+ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ } else {
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
}
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(buf, len, "<hole>");
- return;
- }
+ return (guid);
+}
+
+void
+sprintf_blkptr(char *buf, const blkptr_t *bp)
+{
+ char *type = NULL;
+ char *checksum = NULL;
+ char *compress = NULL;
- (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
- (u_longlong_t)BP_GET_LEVEL(bp),
- dmu_ot[BP_GET_TYPE(bp)].ot_name,
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp));
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "DVA[%d]=<%llu:%llx:%llx> ", d,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)DVA_GET_ASIZE(dva));
+ if (bp != NULL) {
+ type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+ checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- BP_IS_GANG(bp) ? "gang" : "contiguous",
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
}
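
With the length argument gone, callers are expected to pass a buffer of at least BP_SPRINTF_LEN bytes, as the dprintf_dbuf_bp() hunk later in this diff does:

    char *buf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);

    sprintf_blkptr(buf, bp);
    /* ... emit buf ... */
    kmem_free(buf, BP_SPRINTF_LEN);
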
void
@@ -1233,59 +1328,55 @@ spa_first_txg(spa_t *spa)
return (spa->spa_first_txg);
}
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
pool_state_t
spa_state(spa_t *spa)
{
return (spa->spa_state);
}
-uint64_t
-spa_freeze_txg(spa_t *spa)
+spa_load_state_t
+spa_load_state(spa_t *spa)
{
- return (spa->spa_freeze_txg);
+ return (spa->spa_load_state);
}
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
uint64_t
-spa_get_alloc(spa_t *spa)
+spa_freeze_txg(spa_t *spa)
{
- return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+ return (spa->spa_freeze_txg);
}
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
+/* ARGSUSED */
uint64_t
-spa_get_space(spa_t *spa)
+spa_get_asize(spa_t *spa, uint64_t lsize)
{
- return (spa->spa_root_vdev->vdev_stat.vs_space);
+ /*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().
+ */
+ return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
}
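
Worked out with the usual constants (VDEV_RAIDZ_MAXPARITY == 3, SPA_DVAS_PER_BP == 3), the new bound is 24x rather than the old 6x:

    /* (3 + 1) * 3 * 2 == 24; a 128K logical write reserves 3M worst-case */
    uint64_t worst = spa_get_asize(spa, 128 << 10);     /* 3145728 */
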
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
uint64_t
spa_get_dspace(spa_t *spa)
{
- if (spa->spa_deflate)
- return (spa->spa_root_vdev->vdev_stat.vs_dspace);
- else
- return (spa->spa_root_vdev->vdev_stat.vs_space);
+ return (spa->spa_dspace);
}
-/* ARGSUSED */
-uint64_t
-spa_get_asize(spa_t *spa, uint64_t lsize)
+void
+spa_update_dspace(spa_t *spa)
{
- /*
- * For now, the worst case is 512-byte RAID-Z blocks, in which
- * case the space requirement is exactly 2x; so just assume that.
- * Add to this the fact that we can have up to 3 DVAs per bp, and
- * we have to multiply by a total of 6x.
- */
- return (lsize * 6);
+ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+ ddt_get_dedup_dspace(spa);
}
/*
@@ -1310,6 +1401,24 @@ spa_version(spa_t *spa)
return (spa->spa_ubsync.ub_version);
}
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
int
spa_max_replication(spa_t *spa)
{
@@ -1323,24 +1432,52 @@ spa_max_replication(spa_t *spa)
return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
- int sz = 0, i;
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- if (vd)
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
- SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ if (asize != 0 && spa->spa_deflate) {
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
spa_config_exit(spa, SCL_VDEV, FTAG);
- return (sz);
+
+ return (dsize);
}
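
A sketch of the division of labor between the two variants: callers of the _sync flavor must already hold at least one config lock, per the ASSERT in dva_get_dsize_sync(), while the plain wrapper takes SCL_VDEV itself:

    /* syncing context -- a config lock is already held: */
    uint64_t dsize = bp_get_dsize_sync(spa, bp);

    /* open context -- the wrapper takes SCL_VDEV as reader itself: */
    uint64_t dsize2 = bp_get_dsize(spa, bp);
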
/*
@@ -1442,9 +1579,18 @@ spa_has_slogs(spa_t *spa)
return (spa->spa_log_class->mc_rotor != NULL);
}
-/*
- * Return whether this pool is the root pool.
- */
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
boolean_t
spa_is_root(spa_t *spa)
{
@@ -1462,3 +1608,69 @@ spa_mode(spa_t *spa)
{
return (spa->spa_mode);
}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ spa->spa_scan_pass_exam = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (ENOENT);
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+ ps->pss_state = scn->scn_phys.scn_state;
+
+ /* data not stored on disk */
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+ return (0);
+}
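
A hypothetical consumer, roughly what a zpool-status reporting path would do with these stats (the rate math is illustrative):

    pool_scan_stat_t ps;

    if (spa_scan_get_stats(spa, &ps) == 0) {
            uint64_t pass_secs = gethrestime_sec() - ps.pss_pass_start;
            uint64_t rate = pass_secs ? ps.pss_pass_exam / pass_secs : 0;
            /* report rate, ps.pss_examined vs. ps.pss_to_examine, etc. */
    }
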
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index d0251419cbc4..1ce7b2a3d466 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -258,8 +258,10 @@ space_map_load_wait(space_map_t *sm)
{
ASSERT(MUTEX_HELD(sm->sm_lock));
- while (sm->sm_loading)
+ while (sm->sm_loading) {
+ ASSERT(!sm->sm_loaded);
cv_wait(&sm->sm_load_cv, sm->sm_lock);
+ }
}
/*
@@ -276,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
int error = 0;
ASSERT(MUTEX_HELD(sm->sm_lock));
-
- space_map_load_wait(sm);
-
- if (sm->sm_loaded)
- return (0);
+ ASSERT(!sm->sm_loaded);
+ ASSERT(!sm->sm_loading);
sm->sm_loading = B_TRUE;
end = smo->smo_objsize;
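
Hoisting the wait/loaded checks out of space_map_load() means every caller is now responsible for the pattern the new ASSERTs encode; a sketch, with the trailing smo/os arguments assumed from the (truncated) prototype above:

    mutex_enter(sm->sm_lock);
    space_map_load_wait(sm);                /* wait out any concurrent loader */
    if (!sm->sm_loaded)
            error = space_map_load(sm, ops, maptype, smo, os);
    mutex_exit(sm->sm_lock);
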
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index f52851d69f46..8f189c62d31d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ARC_H
@@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
- krwlock_t b_lock;
+ kmutex_t b_evict_lock;
+ krwlock_t b_data_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
@@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb);
int arc_released(arc_buf_t *buf);
int arc_has_callback(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
@@ -99,27 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf);
int arc_referenced(arc_buf_t *buf);
#endif
-typedef struct writeprops {
- dmu_object_type_t wp_type;
- uint8_t wp_level;
- uint8_t wp_copies;
- uint8_t wp_dncompress, wp_oscompress;
- uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
int arc_buf_evict(arc_buf_t *buf);
@@ -135,7 +127,7 @@ void arc_fini(void);
* Level 2 ARC
*/
-void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end);
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
void l2arc_init(void);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
index cdb93a6c35a3..471be9047ec2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -19,68 +19,36 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_BPLIST_H
#define _SYS_BPLIST_H
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
#include <sys/zfs_context.h>
+#include <sys/spa.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct bplist_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
- * contents is an array of bpl_entries blkptr_t's, representing
- * a total of bpl_bytes physical space.
- */
- uint64_t bpl_entries;
- uint64_t bpl_bytes;
- uint64_t bpl_comp;
- uint64_t bpl_uncomp;
-} bplist_phys_t;
-
-#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
- blkptr_t bpq_blk;
- void *bpq_next;
-} bplist_q_t;
+typedef struct bplist_entry {
+ blkptr_t bpe_blk;
+ list_node_t bpe_node;
+} bplist_entry_t;
typedef struct bplist {
kmutex_t bpl_lock;
- objset_t *bpl_mos;
- uint64_t bpl_object;
- uint8_t bpl_blockshift;
- uint8_t bpl_bpshift;
- uint8_t bpl_havecomp;
- bplist_q_t *bpl_queue;
- bplist_phys_t *bpl_phys;
- dmu_buf_t *bpl_dbuf;
- dmu_buf_t *bpl_cached_dbuf;
+ list_t bpl_list;
} bplist_t;
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
- uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+ void *arg, dmu_tx_t *tx);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
new file mode 100644
index 000000000000..3771a9541aa7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+ * a total of bpo_bytes physical space.
+ */
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+ kmutex_t bpo_lock;
+ objset_t *bpo_os;
+ uint64_t bpo_object;
+ int bpo_epb;
+ uint8_t bpo_havecomp;
+ uint8_t bpo_havesubobj;
+ bpobj_phys_t *bpo_phys;
+ dmu_buf_t *bpo_dbuf;
+ dmu_buf_t *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
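
A minimal sketch of the lifecycle this header implies (the block size and the VERIFY-based error handling are assumptions, not the canonical usage):

    static int
    count_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
    {
            (*(uint64_t *)arg)++;
            return (0);
    }

    static void
    bpobj_example(objset_t *mos, const blkptr_t *bp, dmu_tx_t *tx)
    {
            bpobj_t bpo;
            uint64_t obj, count = 0;

            obj = bpobj_alloc(mos, SPA_MAXBLOCKSIZE, tx);
            VERIFY(bpobj_open(&bpo, mos, obj) == 0);
            bpobj_enqueue(&bpo, bp, tx);
            VERIFY(bpobj_iterate_nofree(&bpo, count_cb, &count, tx) == 0);
            bpobj_close(&bpo);
    }
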
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 7e2754d000b4..cf1bbc030f45 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DBUF_H
#define _SYS_DBUF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -35,12 +32,12 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
#endif
-#define DB_BONUS_BLKID (-1ULL)
#define IN_DMU_SYNC 2
/*
@@ -55,25 +52,28 @@ extern "C" {
#define DB_RF_CACHED (1 << 5)
/*
- * The state transition diagram for dbufs looks like:
+ * The simplified state transition diagram for dbufs looks like:
*
* +----> READ ----+
* | |
* | V
* (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^
- * | |
- * +----> FILL ----+
+ * | ^ ^
+ * | | |
+ * +----> FILL ----+ |
+ * | |
+ * | |
+ * +--------> NOFILL -------+
*/
typedef enum dbuf_states {
DB_UNCACHED,
DB_FILL,
+ DB_NOFILL,
DB_READ,
DB_CACHED,
DB_EVICTING
} dbuf_states_t;
-struct objset_impl;
struct dnode;
struct dmu_tx;
@@ -83,9 +83,6 @@ struct dmu_tx;
* etc.
*/
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
struct dmu_buf_impl;
typedef enum override_states {
@@ -132,6 +129,7 @@ typedef struct dbuf_dirty_record {
arc_buf_t *dr_data;
blkptr_t dr_overridden_by;
override_states_t dr_override_state;
+ uint8_t dr_copies;
} dl;
} dt;
} dbuf_dirty_record_t;
@@ -146,18 +144,20 @@ typedef struct dmu_buf_impl {
dmu_buf_t db;
/* the objset we belong to */
- struct objset_impl *db_objset;
+ struct objset *db_objset;
/*
- * the dnode we belong to (NULL when evicted)
+ * handle to safely access the dnode we belong to (NULL when evicted)
*/
- struct dnode *db_dnode;
+ struct dnode_handle *db_dnode_handle;
/*
* our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
* only accessed by sync thread ???
* (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
*/
struct dmu_buf_impl *db_parent;
@@ -240,6 +240,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
@@ -253,17 +257,19 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
@@ -271,30 +277,53 @@ void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define DB_GET_SPA(_spa_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_spa_p) = __dn->dn_objset->os_spa; \
+ DB_DNODE_EXIT(_db); \
+}
+#define DB_GET_OBJSET(_os_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_os_p) = __dn->dn_objset; \
+ DB_DNODE_EXIT(_db); \
+}
+
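
The ENTER/EXIT pair brackets every dereference of the handle so dnode_move() cannot relocate the dnode mid-use; a sketch:

    dnode_t *dn;

    DB_DNODE_ENTER(db);             /* zrl_add: pin the dnode handle */
    dn = DB_DNODE(db);
    /* ... use dn; keep this window short ... */
    DB_DNODE_EXIT(db);              /* zrl_remove: unpin */
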
void dbuf_init(void);
void dbuf_fini(void);
-#define DBUF_IS_METADATA(db) \
- ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_IS_METADATA(_db) \
+ (dbuf_is_metadata(_db))
-#define DBUF_GET_BUFC_TYPE(db) \
- (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-#define DBUF_IS_CACHEABLE(db) \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-#define DBUF_IS_L2CACHEABLE(db) \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
@@ -322,10 +351,10 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ sprintf_blkptr(__blkbuf, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
+ } \
_NOTE(CONSTCOND) } while (0)
#define DBUF_VERIFY(db) dbuf_verify(db)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
new file mode 100644
index 000000000000..9724d6ecebb0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+ DDT_TYPE_ZAP = 0,
+ DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+ DDT_CLASS_DITTO = 0,
+ DDT_CLASS_DUPLICATE,
+ DDT_CLASS_UNIQUE,
+ DDT_CLASSES
+};
+
+#define DDT_TYPE_CURRENT 0
+
+#define DDT_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_COMPRESS_FUNCTION_MASK 0x7f
+
+/*
+ * On-disk ddt entry: key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+ zio_cksum_t ddk_cksum; /* 256-bit block checksum */
+ uint64_t ddk_prop; /* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define DDK_GET_LSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_LSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_PSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_PSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
+#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
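
A worked decoding, assuming the usual BF64_GET_SB(x, off, len, shift, bias) semantics of (raw + bias) << shift:

    /*
     * A 128K logical block: DDK_SET_LSIZE stores (131072 >> 9) - 1 = 255
     * in bits 0..15, and DDK_GET_LSIZE reconstructs (255 + 1) << 9 = 131072.
     * The +1 bias means a raw value of 0 already encodes one 512-byte
     * sector, so the 16-bit field covers sizes up to a full 32M.
     */
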
+
+typedef struct ddt_phys {
+ dva_t ddp_dva[SPA_DVAS_PER_BP];
+ uint64_t ddp_refcnt;
+ uint64_t ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+ DDT_PHYS_DITTO = 0,
+ DDT_PHYS_SINGLE = 1,
+ DDT_PHYS_DOUBLE = 2,
+ DDT_PHYS_TRIPLE = 3,
+ DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+ ddt_key_t dde_key;
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+ void *dde_repair_data;
+ enum ddt_type dde_type;
+ enum ddt_class dde_class;
+ uint8_t dde_loading;
+ uint8_t dde_loaded;
+ kcondvar_t dde_cv;
+ avl_node_t dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+ kmutex_t ddt_lock;
+ avl_tree_t ddt_tree;
+ avl_tree_t ddt_repair_tree;
+ enum zio_checksum ddt_checksum;
+ spa_t *ddt_spa;
+ objset_t *ddt_os;
+ uint64_t ddt_stat_object;
+ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+ avl_node_t ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+ uint64_t ddb_class;
+ uint64_t ddb_type;
+ uint64_t ddb_checksum;
+ uint64_t ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+ char ddt_op_name[32];
+ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+ boolean_t prehash);
+ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+ void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+ ddt_entry_t *dde);
+ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ uint64_t *walk);
+ uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define DDT_NAMELEN 80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+ uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+ const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+ uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+ const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 08c30c8ed015..4f91a91a5d61 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -19,15 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_DMU_H
#define _SYS_DMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file describes the interface that the DMU provides for its
* consumers.
@@ -39,12 +38,14 @@
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
+#include <sys/time.h>
#ifdef __cplusplus
extern "C" {
#endif
struct uio;
+struct xuio;
struct page;
struct vnode;
struct spa;
@@ -60,8 +61,9 @@ struct drr_end;
struct zbookmark;
struct spa;
struct nvlist;
-struct objset_impl;
struct arc_buf;
+struct zio_prop;
+struct sa_handle;
struct file;
typedef struct objset objset_t;
@@ -75,8 +77,8 @@ typedef enum dmu_object_type {
DMU_OT_OBJECT_ARRAY, /* UINT64 */
DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPLIST, /* UINT64 */
- DMU_OT_BPLIST_HDR, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
/* spa: */
DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
DMU_OT_SPACE_MAP, /* UINT64 */
@@ -116,9 +118,22 @@ typedef enum dmu_object_type {
DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */
- DMU_OT_SCRUB_QUEUE, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -141,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size);
void zfs_acl_byteswap(void *buf, size_t size);
void zfs_znode_byteswap(void *buf, size_t size);
-#define DS_MODE_NOHOLD 0 /* internal use only */
-#define DS_MODE_USER 1 /* simple access, no special needs */
-#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */
-#define DS_MODE_TYPE_MASK 0x3
-#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK)
-#define DS_MODE_READONLY 0x8
-#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
-#define DS_MODE_INCONSISTENT 0x10
-#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
-
#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
@@ -163,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
+#define DMU_DEADLIST_OBJECT (-3ULL)
/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
* Public routines to create, destroy, open, and close objsets.
*/
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
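
A sketch of the new reference discipline that replaces dmu_objset_open()/dmu_objset_close() (the dataset name is hypothetical; FTAG is the usual caller tag):

    static void
    example_objset_refs(void)
    {
            objset_t *os;

            /* short-lived, read-only reference: */
            if (dmu_objset_hold("tank/fs", FTAG, &os) == 0) {
                    /* ... inspect os ... */
                    dmu_objset_rele(os, FTAG);
            }

            /* long-lived ownership, e.g. for a mount: */
            if (dmu_objset_own("tank/fs", DMU_OST_ZFS, B_FALSE, FTAG, &os) == 0)
                    dmu_objset_disown(os, FTAG);
    }
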
int dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
-int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
- boolean_t recursive);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
int dmu_objset_rename(const char *name, const char *newname,
boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
@@ -202,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_DIRECTORY_OBJECT 1
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
#define DMU_POOL_ERRLOG_LAST "errlog_last"
#define DMU_POOL_SPARES "spares"
@@ -210,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
#define DMU_POOL_L2CACHE "l2cache"
-
-/* 4x8 zbookmark_t */
-#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define DMU_POOL_SCRUB_QUEUE "scrub_queue"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define DMU_POOL_SCRUB_FUNC "scrub_func"
-/* 1x8 count */
-#define DMU_POOL_SCRUB_ERRORS "scrub_errors"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_DDT "DDT-%s-%s-%s"
+#define DMU_POOL_DDT_STATS "DDT-statistics"
+#define DMU_POOL_CREATION_VERSION "creation_version"
+#define DMU_POOL_SCAN "scan"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
/*
* Allocate an object from this objset. The range of object numbers
@@ -307,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
/*
- * Decide how many copies of a given block we should make. Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
*/
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
- dmu_object_type_t ot);
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+ struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -325,6 +334,18 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
/*
* Obtain the DMU buffer from the specified object which contains the
@@ -341,7 +362,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **);
+ void *tag, dmu_buf_t **, int flags);
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
@@ -438,12 +459,35 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+
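
A sketch following the contract above (the object number and payload are hypothetical); the callback is registered before dmu_tx_assign(), so it fires with error != 0 even on the abort path:

    static void
    example_cb(void *dcb_data, int error)
    {
            /* error != 0 if the tx was aborted or failed to commit */
            kmem_free(dcb_data, sizeof (uint64_t));
    }

    static int
    example_commit(objset_t *os, uint64_t object, void *dcb_data)
    {
            dmu_tx_t *tx = dmu_tx_create(os);
            int error;

            dmu_tx_hold_bonus(tx, object);
            dmu_tx_callback_register(tx, example_cb, dcb_data);
            if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
                    dmu_tx_abort(tx);       /* example_cb fires, error != 0 */
                    return (error);
            }
            /* ... dirty buffers under this tx ... */
            dmu_tx_commit(tx);              /* example_cb fires once the txg syncs */
            return (0);
    }
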
+/*
* Free up the data blocks for a defined range of a file. If size is
* zero, the range from offset to end-of-file is freed.
*/
@@ -465,15 +509,28 @@ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
@@ -484,19 +541,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
uint64_t len);
typedef struct dmu_object_info {
- /* All sizes are in bytes. */
+ /* All sizes are in bytes unless otherwise indicated. */
uint32_t doi_data_block_size;
uint32_t doi_metadata_block_size;
- uint64_t doi_bonus_size;
dmu_object_type_t doi_type;
dmu_object_type_t doi_bonus_type;
+ uint64_t doi_bonus_size;
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_pad[5];
- /* Values below are number of 512-byte blocks. */
- uint64_t doi_physical_blks; /* data + metadata */
- uint64_t doi_max_block_offset;
+ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
+ uint64_t doi_max_offset;
+ uint64_t doi_fill_count; /* number of non-empty blocks */
} dmu_object_info_t;
typedef void arc_byteswap_func_t(void *buf, size_t size);
@@ -565,6 +622,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
*/
uint64_t dmu_objset_fsid_guid(objset_t *os);
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
int dmu_objset_is_snapshot(objset_t *os);
extern struct spa *dmu_objset_spa(objset_t *os);
@@ -574,6 +636,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_syncprop(objset_t *os);
+extern uint64_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -581,9 +645,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp);
-typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
- void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
- dmu_tx_t *tx);
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+ void *bonus, uint64_t *userp, uint64_t *groupp);
extern void dmu_objset_register_type(dmu_objset_type_t ost,
objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
@@ -604,9 +667,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
* storage when the write completes this new data does not become a
* permanent part of the file until the associated transaction commits.
*/
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
- struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+ struct zilog *zgd_zilog;
+ struct blkptr *zgd_bp;
+ dmu_buf_t *zgd_db;
+ struct rl *zgd_rl;
+ void *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
/*
* Find the next hole or data block in file starting at *off
@@ -641,15 +715,19 @@ typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_real_ds;
struct drr_begin *drc_drrb;
char *drc_tosnap;
+ char *drc_top_ds;
boolean_t drc_newfs;
boolean_t drc_force;
} dmu_recv_cookie_t;
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp);
+int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep);
int dmu_recv_end(dmu_recv_cookie_t *drc);
-void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
+
+int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp,
+ offset_t *off);
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index 96ce688e1551..2cb7f121cc0d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,6 +29,7 @@
#include <sys/txg_impl.h>
#include <sys/zio.h>
#include <sys/dnode.h>
+#include <sys/kstat.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -210,11 +211,11 @@ extern "C" {
*
* ds_lock
* protects:
- * ds_user_ptr
- * ds_user_evice_func
+ * ds_objset
* ds_open_refcount
* ds_snapname
* ds_phys accounting
+ * ds_phys userrefs zapobj
* ds_reserved
* held from:
* dsl_dataset_*
@@ -232,6 +233,39 @@ extern "C" {
struct objset;
struct dmu_pool;
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
+
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index a8022d2eaa8f..d687642b3f14 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_DMU_OBJSET_H
#define _SYS_DMU_OBJSET_H
@@ -33,18 +34,23 @@
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
#endif
+extern krwlock_t os_lock;
+
struct dsl_dataset;
struct dmu_tx;
-struct objset_impl;
#define OBJSET_PHYS_SIZE 2048
#define OBJSET_OLD_PHYS_SIZE 1024
+#define OBJSET_BUF_HAS_USERUSED(buf) \
+ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
typedef struct objset_phys {
@@ -59,26 +65,32 @@ typedef struct objset_phys {
} objset_phys_t;
struct objset {
- struct objset_impl *os;
- int os_mode;
-};
-
-typedef struct objset_impl {
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
spa_t *os_spa;
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- dnode_t *os_userused_dnode;
- dnode_t *os_groupused_dnode;
+ /*
+ * The following "special" dnodes have no parent and are exempt from
+ * dnode_move(), but they root their descendants in this objset using
+ * handles anyway, so that all access to dnodes from dbufs consistently
+ * uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
- objset_t os;
- uint8_t os_checksum; /* can change, under dsl_dir's locks */
- uint8_t os_compress; /* can change, under dsl_dir's locks */
- uint8_t os_copies; /* can change, under dsl_dir's locks */
- uint8_t os_primary_cache; /* can change, under dsl_dir's locks */
- uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */
+
+ /* can change, under dsl_dir's locks: */
+ uint8_t os_checksum;
+ uint8_t os_compress;
+ uint8_t os_copies;
+ uint8_t os_dedup_checksum;
+ uint8_t os_dedup_verify;
+ uint8_t os_logbias;
+ uint8_t os_primary_cache;
+ uint8_t os_secondary_cache;
+ uint8_t os_sync;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -101,51 +113,69 @@ typedef struct objset_impl {
/* stuff we store for the user */
kmutex_t os_user_ptr_lock;
void *os_user_ptr;
-} objset_impl_t;
+ /* SA layout/attribute registration */
+ sa_os_t *os_sa;
+};
+
+#define DMU_META_OBJSET 0
#define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
#define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
(os)->os_secondary_cache == ZFS_CACHE_METADATA)
/* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
- boolean_t recursive);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
void dmu_objset_stats(objset_t *os, nvlist_t *nv);
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp);
uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
int flags);
int dmu_objset_find_spa(spa_t *spa, const char *name,
int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
-int dmu_objset_prefetch(char *name, void *arg);
+int dmu_objset_prefetch(const char *name, void *arg);
void dmu_objset_byteswap(void *buf, size_t size);
int dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
/* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+boolean_t dmu_objset_is_dirty_anywhere(objset_t *os);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
- objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
-boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+ objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
index 3e026891153c..5b326cd99c09 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DMU_TRAVERSE_H
@@ -36,19 +35,27 @@ extern "C" {
struct dnode_phys;
struct dsl_dataset;
+struct zilog;
+struct arc_buf;
-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
- const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+ void *arg);
#define TRAVERSE_PRE (1<<0)
#define TRAVERSE_POST (1<<1)
#define TRAVERSE_PREFETCH_METADATA (1<<2)
#define TRAVERSE_PREFETCH_DATA (1<<3)
#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define TRAVERSE_HARD (1<<4)
-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+/* Special traverse error return value to indicate skipping of children */
+#define TRAVERSE_VISIT_NO_CHILDREN -1
+
+int traverse_dataset(struct dsl_dataset *ds,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
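+/*
+ * Callback sketch (illustrative only): count every block pointer in a
+ * dataset, walking pre-order with prefetch enabled.
+ *
+ *	static int
+ *	count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ *	    struct arc_buf *pbuf, const zbookmark_t *zb,
+ *	    const struct dnode_phys *dnp, void *arg)
+ *	{
+ *		if (bp != NULL)
+ *			(*(uint64_t *)arg)++;
+ *		return (0);
+ *	}
+ *
+ *	uint64_t count = 0;
+ *	(void) traverse_dataset(ds, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH,
+ *	    count_cb, &count);
+ */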
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
index 6aaf35dc038f..bbc66347d525 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DMU_TX_H
#define _SYS_DMU_TX_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/refcount.h>
@@ -58,6 +56,7 @@ struct dmu_tx {
txg_handle_t tx_txgh;
void *tx_tempreserve_cookie;
struct dmu_tx_hold *tx_needassign_txh;
+ list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
uint8_t tx_anyobj;
int tx_err;
#ifdef ZFS_DEBUG
@@ -77,6 +76,7 @@ enum dmu_tx_hold_type {
THT_FREE,
THT_ZAP,
THT_SPACE,
+ THT_SPILL,
THT_NUMTYPES
};
@@ -97,6 +97,11 @@ typedef struct dmu_tx_hold {
#endif
} dmu_tx_hold_t;
+typedef struct dmu_tx_callback {
+ list_node_t dcb_node; /* linked to tx_callbacks list */
+ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
+ void *dcb_data; /* caller private data */
+} dmu_tx_callback_t;
/*
* These routines are defined in dmu.h, and are called by the user.
@@ -108,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx);
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
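+/*
+ * Commit-callback sketch (illustrative; assumes the dmu.h callback
+ * signature of (void *arg, int error)): the callback fires once the
+ * tx's txg commits to disk (error == 0) or is aborted (error != 0).
+ *
+ *	static void
+ *	my_commit_cb(void *data, int error)
+ *	{
+ *		if (error == 0)
+ *			... data is now durable on disk ...
+ *	}
+ *
+ *	dmu_tx_callback_register(tx, my_commit_cb, my_data);
+ */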
/*
* These routines are defined in dmu_spa.h, and are called by the SPA.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
index 48e4da8cd647..9ad4be36bf85 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DNODE_H
@@ -33,6 +32,7 @@
#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -63,6 +63,18 @@ extern "C" {
#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
/*
+ * dnode id flags
+ *
+ * Note: a file's ids are never moved from bonus->spill
+ * once set; only in a crypto environment would they
+ * reside in the spill block at all.

+ */
+#define DN_ID_CHKED_BONUS 0x1
+#define DN_ID_CHKED_SPILL 0x2
+#define DN_ID_OLD_EXIST 0x4
+#define DN_ID_NEW_EXIST 0x8
+
+/*
* Derived constants.
*/
#define DNODE_SIZE (1 << DNODE_SHIFT)
@@ -70,10 +82,12 @@ extern "C" {
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
+#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
/* The +2 here is a cheesy way to round up */
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
@@ -88,7 +102,7 @@ extern "C" {
#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
struct dmu_buf_impl;
-struct objset_impl;
+struct objset;
struct zio;
enum dnode_dirtycontext {
@@ -101,6 +115,9 @@ enum dnode_dirtycontext {
#define DNODE_FLAG_USED_BYTES (1<<0)
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
@@ -121,7 +138,8 @@ typedef struct dnode_phys {
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
} dnode_phys_t;
typedef struct dnode {
@@ -136,9 +154,10 @@ typedef struct dnode {
list_node_t dn_link;
/* immutable: */
- struct objset_impl *dn_objset;
+ struct objset *dn_objset;
uint64_t dn_object;
struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
@@ -155,15 +174,21 @@ typedef struct dnode {
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint8_t dn_next_bonustype[TXG_SIZE];
+ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
@@ -183,33 +208,60 @@ typedef struct dnode {
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
+ list_t dn_dbufs; /* descendent dbufs */
+
+ /* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+ boolean_t dn_have_spill; /* have spill or are spilling */
+
/* parent IO for current sync write */
zio_t *dn_zio;
/* used in syncing context */
- dnode_phys_t *dn_oldphys;
+ uint64_t dn_oldused; /* old phys used bytes */
+ uint64_t dn_oldflags; /* old phys dn_flags */
+ uint64_t dn_olduid, dn_oldgid;
+ uint64_t dn_newuid, dn_newgid;
+ int dn_id_flags;
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[1]; /* sized dynamically */
+} dnode_children_t;
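+/*
+ * Access sketch (illustrative; assumes zrl_add()/zrl_remove() from
+ * zrlock.h): readers pin the dnode through its handle so that
+ * dnode_move() cannot relocate it while in use.
+ *
+ *	dnode_t *dn;
+ *
+ *	zrl_add(&dnh->dnh_zrlock);
+ *	dn = dnh->dnh_dnode;
+ *	... use dn ...
+ *	zrl_remove(&dnh->dnh_zrlock);
+ */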
+
typedef struct free_range {
avl_node_t fr_node;
uint64_t fr_blkid;
uint64_t fr_nblks;
} free_range_t;
-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
+dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
-int dnode_hold(struct objset_impl *dd, uint64_t object,
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index a1c2896e3cfb..22733d070e8b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DATASET_H
@@ -33,6 +32,7 @@
#include <sys/bplist.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
#ifdef __cplusplus
extern "C" {
@@ -42,8 +42,6 @@ struct dsl_dataset;
struct dsl_dir;
struct dsl_pool;
-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
#define DS_FLAG_INCONSISTENT (1ULL<<0)
#define DS_IS_INCONSISTENT(ds) \
((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
@@ -63,6 +61,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. The dataset is then destroyed once the last user hold
+ * on it is released (e.g. via 'zfs release').
+ */
+#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
+#define DS_IS_DEFER_DESTROY(ds) \
+ ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -77,7 +83,7 @@ typedef struct dsl_dataset_phys {
uint64_t ds_num_children; /* clone/snap children; ==0 for head */
uint64_t ds_creation_time; /* seconds since 1970 */
uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */
+ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
uint64_t ds_used_bytes;
uint64_t ds_compressed_bytes;
uint64_t ds_uncompressed_bytes;
@@ -93,7 +99,8 @@ typedef struct dsl_dataset_phys {
blkptr_t ds_bp;
uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
- uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */
+ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
+ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
@@ -106,10 +113,13 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
- uint64_t ds_origin_txg;
/* has internal locking: */
- bplist_t ds_deadlist;
+ dsl_deadlist_t ds_deadlist;
+ bplist_t ds_pending_deadlist;
+
+ /* to protect against multiple concurrent incremental recv */
+ kmutex_t ds_recvlock;
/* protected by lock on pool's dp_dirty_datasets list */
txg_node_t ds_dirty_link;
@@ -120,8 +130,8 @@ typedef struct dsl_dataset {
* Protected by ds_lock:
*/
kmutex_t ds_lock;
- void *ds_user_ptr;
- dsl_dataset_evict_func_t *ds_user_evict_func;
+ objset_t *ds_objset;
+ uint64_t ds_userrefs;
/*
* ds_owner is protected by the ds_rwlock and the ds_lock
@@ -143,7 +153,32 @@ typedef struct dsl_dataset {
char ds_snapname[MAXNAMELEN];
} dsl_dataset_t;
-#define dsl_dataset_is_snapshot(ds) \
+struct dsl_ds_destroyarg {
+ dsl_dataset_t *ds; /* ds to destroy */
+ dsl_dataset_t *rm_origin; /* also remove our origin? */
+ boolean_t is_origin_rm; /* set if removing origin snap */
+ boolean_t defer; /* destroy -d requested? */
+ boolean_t releasing; /* destroying due to release? */
+ boolean_t need_prep; /* do we need to retry due to EBUSY? */
+};
+
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX (16 digits) plus one for the hyphen.
+ */
+#define MAX_TAG_PREFIX_LEN 17
+
+struct dsl_ds_holdarg {
+ dsl_sync_task_group_t *dstg;
+ char *htag;
+ char *snapname;
+ boolean_t recursive;
+ boolean_t gotone;
+ boolean_t temphold;
+ char failed[MAXPATHLEN];
+};
+
+#define dsl_dataset_is_snapshot(ds) \
((ds)->ds_phys->ds_num_children != 0)
#define DS_UNIQUE_IS_ACCURATE(ds) \
@@ -152,36 +187,43 @@ typedef struct dsl_dataset {
int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, int flags, void *owner,
- dsl_dataset_t **dsp);
+int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp);
int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
- int flags, void *owner, dsl_dataset_t **);
+ boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_disown(dsl_dataset_t *ds, void *owner);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
- void *owner);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner);
+ void *tag);
+void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
+int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
+int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
dsl_checkfunc_t dsl_dataset_destroy_check;
dsl_syncfunc_t dsl_dataset_destroy_sync;
dsl_checkfunc_t dsl_dataset_snapshot_check;
dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost);
+dsl_syncfunc_t dsl_dataset_user_hold_sync;
int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
+int dsl_dataset_promote(const char *name, char *conflsnap);
int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
boolean_t force);
-
-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive, boolean_t temphold, int cleanup_fd);
+int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold);
+int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive);
+int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+ char *htag, boolean_t retry);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
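+/*
+ * Usage sketch (hypothetical names; a cleanup_fd of -1 is assumed to
+ * mean no cleanup-on-exit handle): place a permanent, non-recursive
+ * user hold on tank/fs@snap, blocking its destruction until released.
+ *
+ *	error = dsl_dataset_user_hold("tank/fs", "snap", "mytag",
+ *	    B_FALSE, B_FALSE, -1);
+ */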
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
@@ -192,10 +234,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
-boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx, boolean_t async);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+ uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
@@ -211,13 +255,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
uint64_t asize, uint64_t inflight, uint64_t *used,
uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx);
-int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation);
-void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags);
-int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation,
- dmu_tx_t *tx);
+int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+ uint64_t quota);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
+int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation);
+
+int dsl_destroy_inconsistent(const char *dsname, void *arg);
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
new file mode 100644
index 000000000000..d2c16d72c17e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+ objset_t *dl_os;
+ uint64_t dl_object;
+ avl_tree_t dl_tree;
+ boolean_t dl_havetree;
+ struct dmu_buf *dl_dbuf;
+ dsl_deadlist_phys_t *dl_phys;
+ kmutex_t dl_lock;
+
+ /* if it's the old on-disk format: */
+ bpobj_t dl_bpobj;
+ boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+ avl_node_t dle_node;
+ uint64_t dle_mintxg;
+ bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+ uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx);
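+/*
+ * Usage sketch (illustrative): query the space charged to an existing
+ * deadlist object, given its object number dlobj.
+ *
+ *	dsl_deadlist_t dl;
+ *	uint64_t used, comp, uncomp;
+ *	dsl_deadlist_open(&dl, os, dlobj);
+ *	dsl_deadlist_space(&dl, &used, &comp, &uncomp);
+ *	dsl_deadlist_close(&dl);
+ */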
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
index b064c9228ec8..73c43bd23879 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
@@ -53,6 +52,9 @@ extern "C" {
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
/*
* Note: the names of properties that are marked delegatable are also
@@ -62,6 +64,7 @@ extern "C" {
int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
index 56d06388cc72..2191635dd813 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -70,7 +69,8 @@ typedef struct dsl_dir_phys {
uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
uint64_t dd_flags;
uint64_t dd_used_breakdown[DD_USED_NUM];
- uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+ uint64_t dd_clones; /* dsl_dir objects */
+ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;
struct dsl_dir {
@@ -89,6 +89,8 @@ struct dsl_dir {
/* Protected by dd_lock */
kmutex_t dd_lock;
list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ uint64_t dd_origin_txg;
/* gross estimate of space used by in-flight tx's */
uint64_t dd_tempreserved[TXG_SIZE];
@@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+ uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation);
int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
+#define XLATION_DIR_NAME "$XLATION"
+#define FREE_DIR_NAME "$FREE"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
index d8da295f3386..7d25bd7c020d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_POOL_H
@@ -32,6 +31,9 @@
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -42,12 +44,7 @@ struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
-
-enum scrub_func {
- SCRUB_FUNC_NONE,
- SCRUB_FUNC_CLEAN,
- SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;
/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -75,6 +72,7 @@ typedef struct dsl_pool {
struct objset *dp_meta_objset;
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
+ struct dsl_dir *dp_free_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
@@ -83,25 +81,18 @@ typedef struct dsl_pool {
blkptr_t dp_meta_rootbp;
list_t dp_synced_datasets;
hrtime_t dp_read_overhead;
- uint64_t dp_throughput;
+ uint64_t dp_throughput; /* bytes per millisec */
uint64_t dp_write_limit;
+ uint64_t dp_tmp_userrefs_obj;
+ bpobj_t dp_free_bpobj;
+
+ struct dsl_scan *dp_scan;
/* Uses dp_lock */
kmutex_t dp_lock;
uint64_t dp_space_towrite[TXG_SIZE];
uint64_t dp_tempreserved[TXG_SIZE];
- enum scrub_func dp_scrub_func;
- uint64_t dp_scrub_queue_obj;
- uint64_t dp_scrub_min_txg;
- uint64_t dp_scrub_max_txg;
- zbookmark_t dp_scrub_bookmark;
- boolean_t dp_scrub_pausing;
- boolean_t dp_scrub_isresilver;
- uint64_t dp_scrub_start_time;
- kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
- boolean_t dp_scrub_restart;
-
/* Has its own locking */
tx_state_t dp_tx;
txg_list_t dp_dirty_datasets;
@@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_memory_pressure(dsl_pool_t *dp);
void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
- struct dmu_tx *tx);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+ const blkptr_t *bpp);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t *now, dmu_tx_t *tx);
+extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, dmu_tx_t *tx);
+extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
index 26018a46d1b2..a636ad35096b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_PROP_H
@@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record {
void *cbr_arg;
} dsl_prop_cb_record_t;
+typedef struct dsl_props_arg {
+ nvlist_t *pa_props;
+ zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+typedef struct dsl_prop_set_arg {
+ const char *psa_name;
+ zprop_source_t psa_source;
+ int psa_intsz;
+ int psa_numints;
+ const void *psa_value;
+
+ /*
+ * Used to handle the special requirements of the quota and reservation
+ * properties.
+ */
+ uint64_t psa_effective_value;
+} dsl_prop_setarg_t;
+
int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg);
int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
@@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname,
int intsz, int numints, void *buf, char *setpoint);
int dsl_prop_get_integer(const char *ddname, const char *propname,
uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
int intsz, int numints, void *buf, char *setpoint);
int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
+ int intsz, int numints, void *buf, char *setpoint,
+ boolean_t snapshot);
dsl_syncfunc_t dsl_props_set_sync;
int dsl_prop_set(const char *ddname, const char *propname,
- int intsz, int numints, const void *buf);
-int dsl_props_set(const char *dsname, nvlist_t *nvl);
-void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx);
+ zprop_source_t source, int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
+void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ dmu_tx_t *tx);
+
+void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+ zprop_source_t source, uint64_t *value);
+int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#ifdef ZFS_DEBUG
+void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) \
+ dsl_prop_check_prediction((dd), (psa))
+#else
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */
+#endif
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
+boolean_t dsl_prop_get_hasrecvd(objset_t *os);
+void dsl_prop_set_hasrecvd(objset_t *os);
+void dsl_prop_unset_hasrecvd(objset_t *os);
void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
void dsl_prop_nvlist_add_string(nvlist_t *nv,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 000000000000..c79666e67de0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SCAN_H
+#define _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+ uint64_t scn_func; /* pool_scan_func_t */
+ uint64_t scn_state; /* dsl_scan_state_t */
+ uint64_t scn_queue_obj;
+ uint64_t scn_min_txg;
+ uint64_t scn_max_txg;
+ uint64_t scn_cur_min_txg;
+ uint64_t scn_cur_max_txg;
+ uint64_t scn_start_time;
+ uint64_t scn_end_time;
+ uint64_t scn_to_examine; /* total bytes to be scanned */
+ uint64_t scn_examined; /* bytes scanned so far */
+ uint64_t scn_to_process;
+ uint64_t scn_processed;
+ uint64_t scn_errors; /* scan I/O error count */
+ uint64_t scn_ddt_class_max;
+ ddt_bookmark_t scn_ddt_bookmark;
+ zbookmark_t scn_bookmark;
+ uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
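+/*
+ * Byteswap sketch (illustrative; assumes BSWAP_64() from
+ * sys/byteorder.h): keeping every member uint64_t-sized lets the
+ * whole record be swapped generically.
+ *
+ *	uint64_t *ip = (uint64_t *)&scn->scn_phys;
+ *	int i;
+ *
+ *	for (i = 0; i < SCAN_PHYS_NUMINTS; i++)
+ *		ip[i] = BSWAP_64(ip[i]);
+ */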
+
+typedef enum dsl_scan_flags {
+ DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+ struct dsl_pool *scn_dp;
+
+ boolean_t scn_pausing;
+ uint64_t scn_restart_txg;
+ uint64_t scn_sync_start_time;
+ zio_t *scn_zio_root;
+
+ /* for debugging / information */
+ uint64_t scn_visited_this_txg;
+
+ dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
index 4995bfe5acca..9126290cdb5b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
#define _SYS_DSL_SYNCTASK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/txg.h>
#include <sys/zfs_context.h>
@@ -38,7 +35,7 @@ extern "C" {
struct dsl_pool;
typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
typedef struct dsl_sync_task {
list_node_t dst_node;
@@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
txg_node_t dstg_node;
list_t dstg_tasks;
struct dsl_pool *dstg_pool;
- cred_t *dstg_cr;
uint64_t dstg_txg;
int dstg_err;
int dstg_space;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index c77b77205490..583d6303bd5a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -36,9 +35,6 @@
extern "C" {
#endif
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
extern space_map_ops_t *zfs_metaslab_ops;
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@@ -58,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
+
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+ int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 5f0b77086b03..07988dd51a73 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -37,9 +37,14 @@ extern "C" {
#endif
struct metaslab_class {
+ spa_t *mc_spa;
metaslab_group_t *mc_rotor;
- uint64_t mc_allocated;
space_map_ops_t *mc_ops;
+ uint64_t mc_aliquot;
+ uint64_t mc_alloc; /* total allocated space */
+ uint64_t mc_deferred; /* total deferred frees */
+ uint64_t mc_space; /* total space (alloc + free) */
+ uint64_t mc_dspace; /* total deflated space */
};
struct metaslab_group {
@@ -48,6 +53,7 @@ struct metaslab_group {
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
int64_t mg_bias;
+ int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
metaslab_group_t *mg_prev;
@@ -67,7 +73,9 @@ struct metaslab {
space_map_obj_t ms_smo_syncing; /* syncing space map object */
space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */
space_map_t ms_map; /* in-core free space map */
+ int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
index e84b1bf65f99..37a28b8c14b9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
#define _SYS_REFCOUNT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cdefs.h>
#include <sys/types.h>
#include_next <sys/refcount.h>
@@ -45,7 +42,7 @@ extern "C" {
*/
#define FTAG ((char *)__func__)
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
typedef struct reference {
list_node_t ref_link;
void *ref_holder;
@@ -72,11 +69,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag);
int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_sysinit(void);
void refcount_fini(void);
-#else /* DEBUG */
+#else /* ZFS_DEBUG */
typedef struct refcount {
uint64_t rc_count;
@@ -93,11 +91,16 @@ typedef struct refcount {
atomic_add_64_nv(&(rc)->rc_count, number)
#define refcount_remove_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, -number)
+#define refcount_transfer(dst, src) { \
+ uint64_t __tmp = (src)->rc_count; \
+ atomic_add_64(&(src)->rc_count, -__tmp); \
+ atomic_add_64(&(dst)->rc_count, __tmp); \
+}
#define refcount_sysinit()
#define refcount_fini()
-#endif /* DEBUG */
+#endif /* ZFS_DEBUG */
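+/*
+ * Usage sketch (illustrative): holds are tagged, conventionally with
+ * FTAG (the calling function's name), so the debug build can report
+ * who leaked a reference.
+ *
+ *	(void) refcount_add(&db->db_holds, FTAG);
+ *	...
+ *	(void) refcount_remove(&db->db_holds, FTAG);
+ */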
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
new file mode 100644
index 000000000000..e12520105bc2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_H
+#define _SYS_SA_H
+
+#include <sys/dmu.h>
+#include <sys/uio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible, new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+ char *sa_name; /* attribute name */
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap; /* bswap function enum */
+ sa_attr_type_t sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+ boolean_t, void *userptr);
+
+/*
+ * Array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_ADD_BULK_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer.
+ */
+typedef struct sa_bulk_attr {
+ void *sa_data;
+ sa_data_locator_t *sa_data_func;
+ uint16_t sa_length;
+ sa_attr_type_t sa_attr;
+ /* the following are private to the sa framework */
+ void *sa_addr;
+ uint16_t sa_buftype;
+ uint16_t sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * Special macro for adding entries for bulk attr support.
+ * b - sa_bulk_attr_t array
+ * idx - integer index that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data
+ * data - pointer to data
+ * len - length of data
+ */
+
+#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+ b[idx].sa_attr = attr;\
+ b[idx].sa_data_func = func; \
+ b[idx].sa_data = data; \
+ b[idx++].sa_length = len; \
+}
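+/*
+ * Usage sketch (hypothetical attribute ids; assumes "attrs" is the
+ * name->attr mapping returned by sa_setup()): build and issue a
+ * two-entry bulk lookup.
+ *
+ *	sa_bulk_attr_t bulk[2];
+ *	int count = 0;
+ *	uint64_t size, links;
+ *	SA_ADD_BULK_ATTR(bulk, count, attrs[0], NULL, &size, 8);
+ *	SA_ADD_BULK_ATTR(bulk, count, attrs[1], NULL, &links, 8);
+ *	error = sa_bulk_lookup(hdl, bulk, count);
+ */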
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+ SA_HDL_SHARED,
+ SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+ uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init();
+void sa_cache_fini();
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
new file mode 100644
index 000000000000..6661e47cfc83
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+ sa_attr_type_t sa_attr;
+ uint8_t sa_registered;
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap;
+ char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | unused | len | bswap | attr num |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16 0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ * ......
+ *
+ */
+
+#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
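+/*
+ * Worked example (illustrative): registering attribute number 5 with a
+ * fixed 16-byte length and byteswap index SA_UINT64_ARRAY (0) yields
+ * x = (16 << 24) | (0 << 16) | 5.
+ */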
+
+#define TOC_OFF(x) BF32_GET(x, 0, 23)
+#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
+#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
+#define TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+ BF32_SET(x, 31, 1, 1); \
+ BF32_SET(x, 24, 7, len_idx); \
+ BF32_SET(x, 0, 24, offset); \
+}
+
+#define SA_LAYOUTS "LAYOUTS"
+#define SA_REGISTRY "REGISTRY"
+
+/*
+ * Each unique layout has its own table,
+ * sa_lot (layout table)
+ */
+typedef struct sa_lot {
+ avl_node_t lot_num_node;
+ avl_node_t lot_hash_node;
+ uint64_t lot_num;
+ uint64_t lot_hash;
+ sa_attr_type_t *lot_attrs; /* array of attr #'s */
+ uint32_t lot_var_sizes; /* how many aren't fixed size */
+ uint32_t lot_attr_count; /* total attr count */
+ list_t lot_idx_tab; /* should be only a couple of entries */
+ int lot_instance; /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+ list_node_t sa_next;
+ sa_lot_t *sa_layout;
+ uint16_t *sa_variable_lengths;
+ refcount_t sa_refcount;
+ uint32_t *sa_idx_tab; /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different tables of
+ * contents if you had several variable-sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * One is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read. The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout. Both
+ * of these trees are interconnected. The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+ kmutex_t sa_lock;
+ boolean_t sa_need_attr_registration;
+ boolean_t sa_force_spill;
+ uint64_t sa_master_obj;
+ uint64_t sa_reg_attr_obj;
+ uint64_t sa_layout_attr_obj;
+ int sa_num_attrs;
+ sa_attr_table_t *sa_attr_table; /* private attr table */
+ sa_update_cb_t *sa_update_cb;
+ avl_tree_t sa_layout_num_tree; /* keyed by layout number */
+ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
+ int sa_user_table_sz;
+ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * Header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths", depending on the number of variable-sized
+ * attributes, which is determined by the "layout number".
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 are the layout number.
+ * Bits 10-15 are the size of the header.
+ * The hdrsize is the stored value * 8.
+ *
+ * For example:
+ * hdrsz of 1 ==> 8 byte header
+ *           2 ==> 16 byte header
+ *
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
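+/*
+ * Worked example (illustrative): encoding layout number 3 with a
+ * 16-byte header stores 16 >> 3 = 2 in the size field, so
+ * sa_layout_info = (2 << 10) | 3; SA_HDR_SIZE() then decodes 16 again.
+ */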
+
+typedef enum sa_buf_type {
+ SA_BONUS = 1,
+ SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+ SA_LOOKUP,
+ SA_UPDATE,
+ SA_ADD,
+ SA_REPLACE,
+ SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+ kmutex_t sa_lock;
+ dmu_buf_t *sa_bonus;
+ dmu_buf_t *sa_spill;
+ objset_t *sa_os;
+ void *sa_userp;
+ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
+ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
+};
+
+#define SA_GET_DB(hdl, type) \
+ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define SA_GET_HDR(hdl, type) \
+ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+ type))->db.db_data))
+
+#define SA_IDX_TAB_GET(hdl, type) \
+ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define IS_SA_BONUSTYPE(a) \
+ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define SA_BONUSTYPE_FROM_DB(db) \
+ (dmu_get_bonustype((dmu_buf_t *)db))
+
+#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define SA_LAYOUT_NUM(x, type) \
+ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+ SA_REGISTERED_LEN(sa, attr))
+
+#define SA_SET_HDR(hdr, num, size) \
+ { \
+ hdr->sa_magic = SA_MAGIC; \
+ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+ }
+
+#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+ { \
+ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+ bulk.sa_buftype = type; \
+ bulk.sa_addr = \
+ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+ (uintptr_t)hdr); \
+}
+
+#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+ sizeof (uint16_t), 8) : 0)))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+ uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+ uint16_t *, sa_hdr_phys_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
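The SA_HDR_LAYOUT_INFO_ENCODE and SA_HDR_* macros above pack two fields into
the 16-bit sa_layout_info word: the layout number in bits 0-9 and the header
size, in 8-byte units, in bits 10-15. Below is a minimal standalone sketch of
that packing using plain shifts and masks instead of the BF32_* helpers; the
sa_info_* names are illustrative only, not part of the header.

#include <assert.h>
#include <stdint.h>

/* Pack a 10-bit layout number and a header size (a multiple of 8 bytes). */
static uint16_t
sa_info_encode(uint16_t layout_num, uint16_t hdr_bytes)
{
	/* hdr_bytes/8 occupies bits 10-15; layout_num occupies bits 0-9. */
	return ((uint16_t)(((hdr_bytes >> 3) << 10) | (layout_num & 0x3ff)));
}

static uint16_t sa_info_layout(uint16_t info) { return (info & 0x3ff); }
static uint16_t sa_info_hdrsize(uint16_t info) { return ((info >> 10) << 3); }

int
main(void)
{
	uint16_t info = sa_info_encode(3, 16);	/* layout 3, 16-byte header */
	assert(sa_info_layout(info) == 3);
	assert(sa_info_hdrsize(info) == 16);	/* stored hdrsz of 2 => 16 bytes */
	return (0);
}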
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index f54a5dc52f23..23d48c8a9e80 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -43,8 +42,13 @@ extern "C" {
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
struct dsl_pool;
/*
@@ -134,15 +138,15 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
+ * 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
+ * a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -166,25 +170,29 @@ typedef struct zio_cksum {
* cksum checksum function
* comp compression function
* G gang block indicator
- * E endianness
- * type DMU object type
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
* lvl level of indirection
- * birth txg transaction group in which the block was born
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
/*
* Macros to get and set fields in a bp or DVA.
*/
@@ -209,7 +217,6 @@ typedef struct blkptr {
#define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
@@ -218,20 +225,35 @@ typedef struct blkptr {
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -239,7 +261,7 @@ typedef struct blkptr {
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@@ -255,6 +277,12 @@ typedef struct blkptr {
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@@ -274,7 +302,10 @@ typedef struct blkptr {
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \
{ \
@@ -287,14 +318,12 @@ typedef struct blkptr {
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
+ (bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
-#define BLK_FILL_ALREADY_FREED (-1ULL)
-
/*
* Note: the byteorder is either 0 or -1, both of which are palindromes.
* This simplifies the endianness handling a bit.
@@ -309,27 +338,88 @@ typedef struct blkptr {
#define BP_SPRINTF_LEN 320
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int size = BP_SPRINTF_LEN; \
+ int len = 0; \
+ int copies = 0; \
+ \
+ if (bp == NULL) { \
+ len = func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len = func(buf + len, size - len, "<hole>"); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)bp->blk_fill, \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
+
#include <sys/dmu.h>
#define BP_GET_BUFC_TYPE(bp) \
(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
+
+typedef enum spa_import_type {
+ SPA_IMPORT_EXISTING,
+ SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+ nvlist_t *policy, nvlist_t **config);
extern int spa_get_stats(const char *pool, nvlist_t **config,
char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
const char *history_str, nvlist_t *zplprops);
-extern int spa_check_rootconf(char *devpath, char *devid,
- nvlist_t **bestconf, uint64_t *besttxg);
-extern boolean_t spa_rootdev_validate(nvlist_t *nv);
extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+ uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -341,12 +431,23 @@ extern void spa_async_suspend(spa_t *spa);
extern void spa_async_resume(spa_t *spa);
extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define SPA_REMOVE_UNSPARE 0x01
+#define SPA_REMOVE_DONE 0x02
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -355,8 +456,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp);
/* spare state (which is global across all pools) */
extern void spa_spare_add(vdev_t *vd);
@@ -370,15 +474,23 @@ extern void spa_l2cache_remove(vdev_t *vd);
extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
extern void spa_l2cache_activate(vdev_t *vd);
extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
extern void spa_sync_allpools(void);
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred. XXX so can't we change it back to 1?
+ */
+#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */
+#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
+#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
+
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
@@ -396,7 +508,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config);
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
int getstats);
extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
/*
* Miscellaneous SPA routines in spa_misc.c
@@ -404,7 +515,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
@@ -413,6 +524,7 @@ extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
+#define SCL_NONE 0x00
#define SCL_CONFIG 0x01
#define SCL_STATE 0x02
#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
@@ -432,12 +544,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
+/* Log state */
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state */
+ SPA_LOG_MISSING, /* missing log(s) */
+ SPA_LOG_CLEAR, /* clear the log(s) */
+ SPA_LOG_GOOD, /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_offline_log(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
/* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@@ -449,36 +579,49 @@ extern char *spa_name(spa_t *spa);
extern uint64_t spa_guid(spa_t *spa);
extern uint64_t spa_last_synced_txg(spa_t *spa);
extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
extern int spa_busy(void);
extern uint8_t spa_get_failmode(spa_t *spa);
extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
boolean_t l2cache);
extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
+
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
#define strtonum(str, nptr) zfs_strtonum((str), (nptr))
@@ -491,10 +634,11 @@ typedef enum history_log_type {
} history_log_type_t;
typedef struct history_arg {
- const char *ha_history_str;
+ char *ha_history_str;
history_log_type_t ha_log_type;
history_internal_events_t ha_event;
- char ha_zone[MAXPATHLEN];
+ char *ha_zone;
+ uid_t ha_uid;
} history_arg_t;
extern char *spa_his_ievent_table[];
@@ -504,16 +648,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
char *his_buf);
extern int spa_history_log(spa_t *spa, const char *his_buf,
history_log_type_t what);
-void spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+ spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
/* error handling */
struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t stateoroffset, uint64_t length);
+ zio_t *zio, uint64_t stateoroffset, uint64_t length);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
extern uint64_t spa_get_errlog_size(spa_t *spa);
extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
@@ -544,7 +689,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ sprintf_blkptr(__blkbuf, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
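The blkptr changes above split "birth" into a logical and a physical txg, and
BP_SET_BIRTH stores the physical txg sparsely: blk_phys_birth stays zero when
the two are equal (the common case), which is why BP_PHYSICAL_BIRTH falls back
to blk_birth. A toy illustration of that convention, with a stand-in struct
rather than the real blkptr_t:

#include <assert.h>
#include <stdint.h>

struct toy_bp {
	uint64_t birth;		/* logical birth txg */
	uint64_t phys_birth;	/* 0 means "same as logical birth" */
};

static void
toy_set_birth(struct toy_bp *bp, uint64_t logical, uint64_t physical)
{
	bp->birth = logical;
	bp->phys_birth = (logical == physical) ? 0 : physical;
}

static uint64_t
toy_physical_birth(const struct toy_bp *bp)
{
	return (bp->phys_birth ? bp->phys_birth : bp->birth);
}

int
main(void)
{
	struct toy_bp bp;

	toy_set_birth(&bp, 100, 100);	/* freshly allocated block */
	assert(bp.phys_birth == 0 && toy_physical_birth(&bp) == 100);

	toy_set_birth(&bp, 150, 100);	/* e.g. a dedup write reusing old data */
	assert(toy_physical_birth(&bp) == 100);
	return (0);
}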
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
index b56073b97516..1d3622f5a108 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SPA_BOOT_H
#define _SYS_SPA_BOOT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#ifdef __cplusplus
@@ -36,7 +34,6 @@ extern "C" {
extern char *spa_get_bootprop(char *prop);
extern void spa_free_bootprop(char *prop);
-extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index ecb065c3f98c..a2f15d2863fa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -36,6 +35,7 @@
#include <sys/avl.h>
#include <sys/refcount.h>
#include <sys/bplist.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -78,13 +78,6 @@ typedef struct spa_config_dirent {
char *scd_path;
} spa_config_dirent_t;
-typedef enum spa_log_state {
- SPA_LOG_UNKNOWN = 0, /* unknown log state */
- SPA_LOG_MISSING, /* missing log(s) */
- SPA_LOG_CLEAR, /* clear the log(s) */
- SPA_LOG_GOOD, /* log(s) are good */
-} spa_log_state_t;
-
enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH,
@@ -93,6 +86,25 @@ enum zio_taskq_type {
ZIO_TASKQ_TYPES
};
+/*
+ * State machine for the zpool-poolname process. The state transitions
+ * are as follows:
+ *
+ * From To Routine
+ * PROC_NONE -> PROC_CREATED spa_activate()
+ * PROC_CREATED -> PROC_ACTIVE spa_thread()
+ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
+ * PROC_DEACTIVATE -> PROC_GONE spa_thread()
+ * PROC_GONE -> PROC_NONE spa_deactivate()
+ */
+typedef enum spa_proc_state {
+ SPA_PROC_NONE, /* spa_proc = &p0, no process created */
+ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
+ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
+ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
+ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -101,13 +113,15 @@ struct spa {
avl_node_t spa_avl; /* node in spa_namespace_avl */
nvlist_t *spa_config; /* last synced config */
nvlist_t *spa_config_syncing; /* currently syncing config */
+ nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
uint64_t spa_config_txg; /* txg of last config change */
int spa_sync_pass; /* iterate-to-convergence */
pool_state_t spa_state; /* pool state */
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
- boolean_t spa_load_verbatim; /* load the given config? */
+ uint64_t spa_import_flags; /* import specific flags */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
metaslab_class_t *spa_normal_class; /* normal data class */
@@ -115,6 +129,9 @@ struct spa {
uint64_t spa_first_txg; /* first txg after spa_open() */
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
@@ -124,21 +141,24 @@ struct spa {
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_sync_bplist_obj; /* object for deferred frees */
- bplist_t spa_sync_bplist; /* deferred-free bplist */
+ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
+ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
+ boolean_t spa_extreme_rewind; /* rewind past deferred frees */
+ uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
- uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
- uint64_t spa_scrub_errors; /* scrub I/O error count */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
uint8_t spa_scrub_finished; /* indicator to rotate logs */
uint8_t spa_scrub_started; /* started since last boot */
uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
+ uint64_t spa_scan_pass_start; /* start time per pass/reboot */
+ uint64_t spa_scan_pass_exam; /* examined bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
@@ -146,7 +166,14 @@ struct spa {
uint16_t spa_async_tasks; /* async task mask */
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
- boolean_t spa_last_open_failed; /* true if last open faled */
+ int spa_last_open_failed; /* error if last open failed */
+ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
+ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_txg; /* ub txg that loaded */
+ uint64_t spa_load_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_meta_errors; /* verify metadata err count */
+ uint64_t spa_load_data_errors; /* verify data err count */
+ uint64_t spa_verify_min_txg; /* start txg of verify scrub */
kmutex_t spa_errlog_lock; /* error log lock */
uint64_t spa_errlog_last; /* last error log object */
uint64_t spa_errlog_scrub; /* scrub error log object */
@@ -168,10 +195,27 @@ struct spa {
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
+ uint8_t spa_claiming; /* pool is doing zil_claim() */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */
+ uint64_t spa_autoexpand; /* lun expansion on/off */
+ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+ uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_ditto; /* dedup ditto threshold */
+ uint64_t spa_dedup_checksum; /* default dedup checksum */
+ uint64_t spa_dspace; /* dspace in normal class */
+ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
+ kmutex_t spa_proc_lock; /* protects spa_proc* */
+ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
+ spa_proc_state_t spa_proc_state; /* see definition */
+ struct proc *spa_proc; /* "zpool-poolname" process */
+ uint64_t spa_did; /* if procp != p0, did of t1 */
+ boolean_t spa_autoreplace; /* autoreplace set in open */
+ int spa_vdev_locks; /* locks grabbed */
+ uint64_t spa_creation_version; /* version at pool creation */
+ uint64_t spa_prev_software_version;
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
@@ -180,16 +224,13 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
refcount_t spa_refcount; /* number of opens */
+#ifndef sun
+ boolean_t spa_splitting_newspa; /* creating new spa in split */
+#endif
};
extern const char *spa_config_path;
-#define BOOTFS_COMPRESS_VALID(compress) \
- ((compress) == ZIO_COMPRESS_LZJB || \
- ((compress) == ZIO_COMPRESS_ON && \
- ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
- (compress) == ZIO_COMPRESS_OFF)
-
#ifdef __cplusplus
}
#endif
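The spa_proc_state comment above defines a strict cycle for the
"zpool-poolname" process. A small sketch that encodes the same transition
table as a checkable function; the names are toy stand-ins, independent of
the real enum:

#include <assert.h>

typedef enum {
	PROC_NONE, PROC_CREATED, PROC_ACTIVE, PROC_DEACTIVATE, PROC_GONE
} proc_state_t;

/* Return nonzero iff the transition appears in the table above. */
static int
proc_transition_ok(proc_state_t from, proc_state_t to)
{
	switch (from) {
	case PROC_NONE:		return (to == PROC_CREATED);	/* spa_activate() */
	case PROC_CREATED:	return (to == PROC_ACTIVE);	/* spa_thread() */
	case PROC_ACTIVE:	return (to == PROC_DEACTIVATE);	/* spa_deactivate() */
	case PROC_DEACTIVATE:	return (to == PROC_GONE);	/* spa_thread() */
	case PROC_GONE:		return (to == PROC_NONE);	/* spa_deactivate() */
	}
	return (0);
}

int
main(void)
{
	assert(proc_transition_ok(PROC_NONE, PROC_CREATED));
	assert(!proc_transition_ok(PROC_ACTIVE, PROC_GONE));	/* must deactivate first */
	return (0);
}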
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
index 23bdff211b4a..e323d5efabb7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_TXG_H
#define _SYS_TXG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/zfs_context.h>
@@ -41,6 +39,9 @@ extern "C" {
#define TXG_INITIAL TXG_SIZE /* initial txg */
#define TXG_IDX (txg & TXG_MASK)
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define TXG_DEFER_SIZE 2
+
#define TXG_WAIT 1ULL
#define TXG_NOWAIT 2ULL
@@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp);
extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
extern void txg_rele_to_quiesce(txg_handle_t *txghp);
extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
+extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
/*
* Delay the caller by the specified number of ticks or until
@@ -117,6 +117,7 @@ extern void txg_list_create(txg_list_t *tl, size_t offset);
extern void txg_list_destroy(txg_list_t *tl);
extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
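The txg machinery above keeps per-txg state in small rings indexed by
txg & TXG_MASK, where TXG_SIZE is a small power of two (4 in this code), so
only the last TXG_SIZE transaction groups have in-core slots at any time. A
tiny sketch of that indexing, with toy names and a toy payload:

#include <assert.h>
#include <stdint.h>

#define	TOY_TXG_SIZE	4	/* must be a power of two */
#define	TOY_TXG_MASK	(TOY_TXG_SIZE - 1)

int
main(void)
{
	uint64_t counts[TOY_TXG_SIZE] = { 0 };

	/* txgs 7 and 11 land in the same slot, TOY_TXG_SIZE txgs apart. */
	counts[7 & TOY_TXG_MASK]++;	/* slot 3 */
	counts[11 & TOY_TXG_MASK]++;	/* slot 3, after 7's slot is recycled */
	assert(counts[3] == 2);
	return (0);
}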
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
index 7413c662b355..7b356eac1293 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,13 +37,13 @@ struct tx_cpu {
kmutex_t tc_lock;
kcondvar_t tc_cv[TXG_SIZE];
uint64_t tc_count[TXG_SIZE];
+ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
char tc_pad[16];
};
typedef struct tx_state {
tx_cpu_t *tx_cpu; /* protects right to enter txg */
kmutex_t tx_sync_lock; /* protects tx_state_t */
- krwlock_t tx_suspend;
uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
@@ -64,6 +64,8 @@ typedef struct tx_state {
kthread_t *tx_sync_thread;
kthread_t *tx_quiesce_thread;
+
+ taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
} tx_state_t;
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
index 93d936ae4b18..b5bb91573145 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,19 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_UBERBLOCK_H
#define _SYS_UBERBLOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/vdev.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#ifdef __cplusplus
extern "C" {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
index b49df8ae0ce3..6ab6aa3135a2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -33,11 +32,6 @@ extern "C" {
#endif
/*
- * For zdb use and debugging purposes only
- */
-extern uint64_t ub_max_txg;
-
-/*
* The uberblock version is incremented whenever an incompatible on-disk
* format change is made to the SPA, DMU, or ZAP.
*
@@ -57,6 +51,9 @@ struct uberblock {
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
+
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
};
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
index 933255464261..941f234dc68f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -47,10 +46,11 @@ typedef enum vdev_dtl_type {
extern boolean_t zfs_nocacheflush;
extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
extern int vdev_validate(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
@@ -69,26 +69,31 @@ extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
+
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
+extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_expand(vdev_t *vd, uint64_t txg);
+extern void vdev_split(vdev_t *vd);
+
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
- boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta, boolean_t update_root);
+extern void vdev_space_update(vdev_t *vd,
+ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-extern int vdev_fault(spa_t *spa, uint64_t guid);
-extern int vdev_degrade(spa_t *spa, uint64_t guid);
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_state_t *);
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
@@ -119,8 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+typedef enum vdev_config_flag {
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+ boolean_t getstats, vdev_config_flag_t flags);
/*
* Label routines
@@ -136,7 +148,8 @@ typedef enum {
VDEV_LABEL_REPLACE, /* replace an existing device */
VDEV_LABEL_SPARE, /* add a new hot spare */
VDEV_LABEL_REMOVE, /* remove an existing device */
- VDEV_LABEL_L2CACHE /* add an L2ARC cache device */
+ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
+ VDEV_LABEL_SPLIT /* generating new label for split-off dev */
} vdev_labeltype_t;
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 93e410250880..7efa3f3b13cf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -62,6 +61,8 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef int vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef void vdev_hold_func_t(vdev_t *vd);
+typedef void vdev_rele_func_t(vdev_t *vd);
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -70,6 +71,8 @@ typedef struct vdev_ops {
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
+ vdev_hold_func_t *vdev_op_hold;
+ vdev_rele_func_t *vdev_op_rele;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -112,19 +115,28 @@ struct vdev {
uint64_t vdev_id; /* child number in vdev parent */
uint64_t vdev_guid; /* unique ID for this vdev */
uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_orig_guid; /* orig. guid prior to remove */
uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
vdev_ops_t *vdev_ops; /* vdev operations */
spa_t *vdev_spa; /* spa for this vdev */
void *vdev_tsd; /* type-specific data */
+ vnode_t *vdev_name_vp; /* vnode for pathname */
+ vnode_t *vdev_devid_vp; /* vnode for devid */
vdev_t *vdev_top; /* top-level vdev */
vdev_t *vdev_parent; /* parent vdev */
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
+ boolean_t vdev_reopening; /* reopen in progress? */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
@@ -139,10 +151,12 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
+ uint64_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
/*
* Leaf vdev state.
@@ -155,6 +169,7 @@ struct vdev {
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilvering; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
@@ -166,6 +181,8 @@ struct vdev {
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_splitting; /* split or repair in progress */
+ boolean_t vdev_delayed_close; /* delayed device close? */
uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
uint8_t vdev_detached; /* device detached? */
uint8_t vdev_cant_read; /* vdev is failing all reads */
@@ -176,6 +193,7 @@ struct vdev {
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
+ vdev_aux_t vdev_label_aux; /* on-disk aux state */
/*
* For DTrace to work in userland (libzpool) context, these fields must
@@ -189,6 +207,8 @@ struct vdev {
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
};
+#define VDEV_RAIDZ_MAXPARITY 3
+
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
@@ -204,8 +224,8 @@ struct vdev {
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
@@ -239,10 +259,14 @@ typedef struct vdev_label {
#define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2
#define VDEV_ALLOC_L2CACHE 3
+#define VDEV_ALLOC_ROOTPOOL 4
+#define VDEV_ALLOC_SPLIT 5
/*
* Allocate or free a vdev
*/
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
@@ -259,7 +283,8 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -279,13 +304,15 @@ extern vdev_ops_t vdev_disk_ops;
#endif
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
/*
* Common size functions
*/
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_rsize(vdev_t *vd);
+extern uint64_t vdev_get_min_asize(vdev_t *vd);
+extern void vdev_set_min_asize(vdev_t *vd);
/*
* zdb uses this tunable, so it must be declared here to make lint happy.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
index ea3a0f632055..a1130bbbaaae 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_H
#define _SYS_ZAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* ZAP - ZFS Attribute Processor
*
@@ -87,9 +84,6 @@
extern "C" {
#endif
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN 1024
-
/*
* The matchtype specifies which entry will be accessed.
* MT_EXACT: only find an exact match (non-normalized)
@@ -106,6 +100,18 @@ typedef enum matchtype
MT_FIRST
} matchtype_t;
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
/*
* Create a new zapobj with no attributes and return its object number.
* MT_EXACT will cause the zap object to only support MT_EXACT lookups,
@@ -123,6 +129,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
/*
* Create a new zapobj with no attributes from the given (unallocated)
@@ -185,6 +194,11 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int add, uint64_t *towrite, uint64_t *tooverwrite);
@@ -195,9 +209,12 @@ int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
* If an attribute with the given name already exists, the call will
* fail and return EEXIST.
*/
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
/*
* Set the attribute with the given name to the given value. If an
@@ -209,6 +226,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
/*
* Get the length (in integers) and the integer size of the specified
@@ -219,6 +239,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers);
/*
* Remove the specified attribute.
@@ -229,6 +251,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx);
/*
* Returns (in *count) the number of attributes in the specified zap
@@ -236,7 +260,6 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
/*
* Returns (in name) the name of the entry whose (value & mask)
* (za_first_integer) is value, or ENOENT if not found. The string
@@ -253,6 +276,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj,
*/
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx);
+
/*
* Manipulate entries where the name + value are the "same" (the name is
* a stringified version of the value).
@@ -260,6 +291,21 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t *valuep);
+
+/*
+ * Increment the value of the named entry by delta; zero values will
+ * be zap_remove()-ed. For zap_increment_int() the name is a stringified key.
+ */
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx);
struct zap;
struct zap_leaf;
@@ -269,6 +317,7 @@ typedef struct zap_cursor {
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
+ uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
} zap_cursor_t;
@@ -320,6 +369,11 @@ void zap_cursor_advance(zap_cursor_t *zc);
uint64_t zap_cursor_serialize(zap_cursor_t *zc);
/*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
+
+/*
* Initialize a zap cursor pointing to the position recorded by
* zap_cursor_serialize (in the "serialized" argument). You can also
* use a "serialized" argument of 0 to start at the beginning of the
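The serialized-cursor comments above describe how a zap walk can be suspended
and resumed: zap_cursor_serialize() yields a 64-bit token and
zap_cursor_init_serialized() restarts from it, with 0 meaning "from the
beginning". A hedged usage sketch of that pattern follows; it assumes the
declarations in this header plus a kernel/libzpool context, so it is not
compilable on its own, and walk_some_entries is an illustrative name:

/* Walk up to 'limit' entries, returning a token for resuming the walk. */
static uint64_t
walk_some_entries(objset_t *os, uint64_t zapobj, uint64_t token, int limit)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	int n = 0;

	zap_cursor_init_serialized(&zc, os, zapobj, token);
	while (n++ < limit && zap_cursor_retrieve(&zc, &za) == 0) {
		/* ... process za.za_name / za.za_first_integer ... */
		zap_cursor_advance(&zc);
	}
	token = zap_cursor_serialize(&zc);	/* position to resume from */
	zap_cursor_fini(&zc);
	return (token);
}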
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
index c86bb16de268..1dc322e02f6f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -40,13 +39,13 @@ extern int fzap_default_block_shift;
#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-#define ZAP_MAXCD (uint32_t)(-1)
-#define ZAP_HASHBITS 28
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define ZAP_NEED_CD (-1U)
+
typedef struct mzap_ent_phys {
uint64_t mze_value;
uint32_t mze_cd;
@@ -67,9 +66,11 @@ typedef struct mzap_ent {
avl_node_t mze_node;
int mze_chunkid;
uint64_t mze_hash;
- mzap_ent_phys_t mze_phys;
+ uint32_t mze_cd; /* copy from mze_phys->mze_cd */
} mzap_ent_t;
+#define MZE_PHYS(zap, mze) \
+ (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
/*
* The (fat) zap is stored in one object. It is an array of
@@ -127,6 +128,7 @@ typedef struct zap_phys {
uint64_t zap_num_entries; /* number of entries */
uint64_t zap_salt; /* salt to stir into hash function */
uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
/*
* This structure is followed by padding, and then the embedded
* pointer table. The embedded pointer table takes up second
@@ -168,10 +170,13 @@ typedef struct zap {
typedef struct zap_name {
zap_t *zn_zap;
- const char *zn_name_orij;
+ int zn_key_intlen;
+ const void *zn_key_orig;
+ int zn_key_orig_numints;
+ const void *zn_key_norm;
+ int zn_key_norm_numints;
uint64_t zn_hash;
matchtype_t zn_matchtype;
- const char *zn_name_norm;
char zn_normbuf[ZAP_MAXNAMELEN];
} zap_name_t;
@@ -183,8 +188,11 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
void zap_evict(dmu_buf_t *db, void *vmzap);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
@@ -193,6 +201,7 @@ int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp);
+void fzap_prefetch(zap_name_t *zn);
int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
uint64_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
@@ -209,7 +218,8 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
+int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
#ifdef __cplusplus
}
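ZAP_HASH_IDX above indexes the fat zap's embedded pointer table with the top
n bits of the 64-bit hash, so doubling the table only ever splits buckets
rather than rehashing them. A standalone sketch of the same indexing, where
hash_idx is an illustrative name:

#include <assert.h>
#include <stdint.h>

/* Index into a 2^n-entry pointer table using the high-order n bits. */
static uint64_t
hash_idx(uint64_t hash, int n)
{
	return (n == 0 ? 0 : hash >> (64 - n));
}

int
main(void)
{
	uint64_t h = 0xC000000000000000ULL;	/* top two bits set */

	assert(hash_idx(h, 1) == 1);		/* 1-bit table: bucket 1 */
	assert(hash_idx(h, 2) == 3);		/* 2-bit table: bucket 3 */
	/* Doubling preserves prefixes: bucket 3 is a split of bucket 1. */
	return (0);
}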
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
index 14144e059e54..3a33636741d9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -19,20 +19,21 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_LEAF_H
#define _SYS_ZAP_LEAF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
+#include <sys/zap.h>
#ifdef __cplusplus
extern "C" {
#endif
struct zap;
+struct zap_name;
+struct zap_stats;
#define ZAP_LEAF_MAGIC 0x2AB1EAF
@@ -129,12 +130,12 @@ typedef struct zap_leaf_phys {
typedef union zap_leaf_chunk {
struct zap_leaf_entry {
uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
- uint8_t le_int_size; /* size of ints */
+ uint8_t le_value_intlen; /* size of value's ints */
uint16_t le_next; /* next entry in hash chain */
uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_name_numints; /* ints in name (incl null) */
uint16_t le_value_chunk; /* first chunk of the value */
- uint16_t le_value_length; /* value length in ints */
+ uint16_t le_value_numints; /* value length in ints */
uint32_t le_cd; /* collision differentiator */
uint64_t le_hash; /* hash value of the name */
} l_entry;
@@ -177,7 +178,7 @@ typedef struct zap_entry_handle {
* value must equal zap_hash(name).
*/
extern int zap_leaf_lookup(zap_leaf_t *l,
- zap_name_t *zn, zap_entry_handle_t *zeh);
+ struct zap_name *zn, zap_entry_handle_t *zeh);
/*
* Return a handle to the entry with this hash+cd, or the entry with the
@@ -193,10 +194,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l,
* num_integers in the attribute.
*/
extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
+ uint8_t integer_size, uint64_t num_integers, void *buf);
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
+extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
/*
* Replace the value of an existing entry.
@@ -204,7 +205,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
* zap_entry_update may fail if it runs out of space (ENOSPC).
*/
extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
/*
* Remove an entry.
@@ -216,17 +217,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh);
* belong in this leaf (according to its hash value). Fills in the
* entry handle on success. Returns 0 on success or ENOSPC on failure.
*/
-extern int zap_entry_create(zap_leaf_t *l,
- const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
+extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
/*
* Return true if there are additional entries with the same normalized
* form.
*/
extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
- zap_name_t *zn, const char *name, zap_t *zap);
+ struct zap_name *zn, const char *name, struct zap *zap);
/*
* Other stuff.
@@ -235,7 +235,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
-extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
+ struct zap_stats *zs);
#ifdef __cplusplus
}
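
The renames above (le_int_size to le_value_intlen, le_name_length to le_name_numints, and so on) make explicit that leaf names and values are arrays of integers rather than byte strings: for string keys the integers are single bytes, while uint64 keys use 8-byte integers. A small sketch of the size arithmetic, with invented field values:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the renamed zap_leaf_entry size fields; the values are invented. */
struct demo_entry {
	uint8_t		le_value_intlen;	/* bytes per value integer */
	uint16_t	le_name_numints;	/* ints in the name (incl null) */
	uint16_t	le_value_numints;	/* ints in the value */
};

int
main(void)
{
	/* A string-keyed entry ("hello" plus NUL) with two uint64 values. */
	struct demo_entry e = { 8, 6, 2 };
	unsigned name_bytes = 1 * e.le_name_numints;	/* string key: 1-byte ints */
	unsigned value_bytes = e.le_value_intlen * e.le_value_numints;

	printf("name = %u bytes, value = %u bytes\n", name_bytes, value_bytes);
	return (0);
}
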
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
index ea1509504305..d3c471a60903 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ACL_H
@@ -32,6 +31,7 @@
#include <sys/acl.h>
#include <sys/dmu.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
@@ -105,12 +105,18 @@ typedef struct zfs_acl_phys_v0 {
#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary for dealing with both the V0 and V1 ACL layouts
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
typedef struct zfs_acl_phys {
uint64_t z_acl_extern_obj; /* ext acl pieces */
uint32_t z_acl_size; /* Number of bytes in ACL */
uint16_t z_acl_version; /* acl version */
uint16_t z_acl_count; /* ace count */
- uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
} zfs_acl_phys_t;
typedef struct acl_ops {
@@ -145,21 +151,26 @@ typedef struct zfs_acl_node {
void *z_allocdata; /* pointer to kmem allocated memory */
size_t z_allocsize; /* Size of blob in bytes */
size_t z_size; /* length of ACL data */
- int z_ace_count; /* number of ACEs in this acl node */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
int z_ace_idx; /* ace iterator positioned on */
} zfs_acl_node_t;
typedef struct zfs_acl {
- int z_acl_count; /* Number of ACEs */
+ uint64_t z_acl_count; /* Number of ACEs */
size_t z_acl_bytes; /* Number of bytes in ACL */
uint_t z_version; /* version of ACL */
void *z_next_ace; /* pointer to next ACE */
- int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
list_t z_acl; /* chunks of ACE data */
acl_ops_t z_ops; /* ACL operations */
} zfs_acl_t;
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
#define ACL_DATA_ALLOCED 0x1
#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
@@ -206,7 +217,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
@@ -214,11 +225,20 @@ void zfs_acl_free(zfs_acl_t *);
int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
struct zfs_fuid_info **, zfs_acl_t **);
int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+ uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
#endif
#ifdef __cplusplus
}
#endif
-
-#endif /* !ZFS_NO_ACL */
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
index 952bb24a4567..6dc163d6f530 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZFS_CONTEXT_H
#define _SYS_ZFS_CONTEXT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -54,6 +52,8 @@ extern "C" {
#include <sys/byteorder.h>
#include <sys/systm.h>
#include <sys/list.h>
+#include <sys/zfs_debug.h>
+#include <sys/sysevent.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/time.h>
@@ -83,10 +83,11 @@ extern "C" {
#include <sys/misc.h>
#include <sys/sig.h>
#include <sys/osd.h>
-#include <sys/zfs_debug.h>
+#include <sys/sysevent/dev.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/u8_textprep.h>
#include <sys/fm/util.h>
+#include <sys/sunddi.h>
#include <machine/stdarg.h>
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
index 450ac1c81b42..50ecf9b36249 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
#define _SYS_ZFS_DEBUG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func,
extern void zfs_panic_recover(const char *fmt, ...);
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
index bd2c938515ff..349f8ef37321 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FS_ZFS_DIR_H
#define _SYS_FS_ZFS_DIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/pathname.h>
#include <sys/dmu.h>
#include <sys/zfs_znode.h>
@@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
pathname_t *);
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
- uint_t, znode_t **, int, zfs_acl_ids_t *);
+ uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
index c035707c62a6..b381bb98e734 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/zfs_vfsops.h>
#endif
#include <sys/avl.h>
+#include <sys/list.h>
#ifdef __cplusplus
extern "C" {
@@ -100,6 +101,8 @@ typedef struct zfs_fuid_info {
#ifdef _KERNEL
struct znode;
extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+ uint64_t, uint64_t, zfs_fuid_type_t);
extern void zfs_fuid_destroy(zfsvfs_t *);
extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
cred_t *, zfs_fuid_info_t **);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index bf107d605fac..63b9c57eb8a9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -19,19 +19,18 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
#define _SYS_ZFS_IOCTL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cred.h>
#include <sys/dmu.h>
#include <sys/zio.h>
#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zfs_stat.h>
#ifdef _KERNEL
#include <sys/nvpair.h>
@@ -47,26 +46,86 @@ extern "C" {
#define ZFS_SNAPDIR_HIDDEN 0
#define ZFS_SNAPDIR_VISIBLE 1
-#define DMU_BACKUP_STREAM_VERSION (1ULL)
-#define DMU_BACKUP_HEADER_VERSION (2ULL)
+/*
+ * Field manipulation macros for the drr_versioninfo field of the
+ * send stream header.
+ */
+
+/*
+ * Header types for zfs send streams.
+ */
+typedef enum drr_headertype {
+ DMU_SUBSTREAM = 0x1,
+ DMU_COMPOUNDSTREAM = 0x2
+} drr_headertype_t;
+
+#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2)
+#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x)
+
+#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30)
+#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x)
+
+/*
+ * Feature flags for zfs send streams (flags in drr_versioninfo)
+ */
+
+#define DMU_BACKUP_FEATURE_DEDUP (0x1)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
+#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
+
+/*
+ * Mask of all supported backup features
+ */
+#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+
+/* Are all features in the given flag word currently supported? */
+#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+
+/*
+ * The drr_versioninfo field of the dmu_replay_record has the
+ * following layout:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | reserved | feature-flags |C|S|
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * The low order two bits indicate the header type: SUBSTREAM (0x1)
+ * or COMPOUNDSTREAM (0x2). Using two bits for this is historical:
+ * this field used to be a version number, where the two version types
+ * were 1 and 2. Using two bits for this allows earlier versions of
+ * the code to be able to recognize send streams that don't use any
+ * of the features indicated by feature flags.
+ */
+
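The macros above pack two fields into the single drr_versioninfo word diagrammed in the comment. BF64_GET/BF64_SET are ZFS's generic bitfield helpers (from sys/spa.h); the sketch below substitutes simplified stand-ins so it runs standalone, then shows a stream being accepted or rejected via DMU_STREAM_SUPPORTED():

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for ZFS's BF64_GET/BF64_SET (sys/spa.h). */
#define BF64_GET(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define BF64_SET(x, low, len, val) \
	((x) = ((x) & ~(((1ULL << (len)) - 1) << (low))) | \
	    (((uint64_t)(val) & ((1ULL << (len)) - 1)) << (low)))

/* Copied from the header above. */
#define DMU_SET_STREAM_HDRTYPE(vi, x)	BF64_SET((vi), 0, 2, x)
#define DMU_SET_FEATUREFLAGS(vi, x)	BF64_SET((vi), 2, 30, x)
#define DMU_GET_FEATUREFLAGS(vi)	BF64_GET((vi), 2, 30)
#define DMU_BACKUP_FEATURE_DEDUP	(0x1)
#define DMU_BACKUP_FEATURE_DEDUPPROPS	(0x2)
#define DMU_BACKUP_FEATURE_SA_SPILL	(0x4)
#define DMU_BACKUP_FEATURE_MASK		(DMU_BACKUP_FEATURE_DEDUP | \
	DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
#define DMU_STREAM_SUPPORTED(x)		(!((x) & ~DMU_BACKUP_FEATURE_MASK))

int
main(void)
{
	uint64_t vi = 0;

	DMU_SET_STREAM_HDRTYPE(vi, 0x2);	/* DMU_COMPOUNDSTREAM */
	DMU_SET_FEATUREFLAGS(vi, DMU_BACKUP_FEATURE_SA_SPILL);

	/* A known feature set is accepted... */
	printf("supported = %d\n",
	    DMU_STREAM_SUPPORTED(DMU_GET_FEATUREFLAGS(vi)));	/* 1 */
	/* ...but any flag outside the mask must be rejected. */
	printf("supported = %d\n", DMU_STREAM_SUPPORTED(0x8));	/* 0 */
	return (0);
}
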
#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
/*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define DRR_CHECKSUM_DEDUP (1<<0)
+
+#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+
+/*
* zfs ioctl command structure
*/
typedef struct dmu_replay_record {
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
- DRR_WRITE, DRR_FREE, DRR_END,
+ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+ DRR_SPILL, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
struct drr_begin {
uint64_t drr_magic;
- uint64_t drr_version;
+ uint64_t drr_versioninfo; /* was drr_version */
uint64_t drr_creation_time;
dmu_objset_type_t drr_type;
uint32_t drr_flags;
@@ -76,6 +135,7 @@ typedef struct dmu_replay_record {
} drr_begin;
struct drr_end {
zio_cksum_t drr_checksum;
+ uint64_t drr_toguid;
} drr_end;
struct drr_object {
uint64_t drr_object;
@@ -83,14 +143,16 @@ typedef struct dmu_replay_record {
dmu_object_type_t drr_bonustype;
uint32_t drr_blksz;
uint32_t drr_bonuslen;
- uint8_t drr_checksum;
+ uint8_t drr_checksumtype;
uint8_t drr_compress;
uint8_t drr_pad[6];
+ uint64_t drr_toguid;
/* bonus content follows */
} drr_object;
struct drr_freeobjects {
uint64_t drr_firstobj;
uint64_t drr_numobjs;
+ uint64_t drr_toguid;
} drr_freeobjects;
struct drr_write {
uint64_t drr_object;
@@ -98,16 +160,61 @@ typedef struct dmu_replay_record {
uint32_t drr_pad;
uint64_t drr_offset;
uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
/* content follows */
} drr_write;
struct drr_free {
uint64_t drr_object;
uint64_t drr_offset;
uint64_t drr_length;
+ uint64_t drr_toguid;
} drr_free;
+ struct drr_write_byref {
+ /* where to put the data */
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ /* where to find the prior copy of the data */
+ uint64_t drr_refguid;
+ uint64_t drr_refobject;
+ uint64_t drr_refoffset;
+ /* properties of the data */
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ } drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
} drr_u;
} dmu_replay_record_t;
+/* diff record range types */
+typedef enum diff_type {
+ DDR_NONE = 0x1,
+ DDR_INUSE = 0x2,
+ DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+ uint64_t ddr_type;
+ uint64_t ddr_first;
+ uint64_t ddr_last;
+} dmu_diff_record_t;
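
dmu_diff_record_t is the unit the diff ioctl streams back to userland. A hypothetical consumer loop follows; the record layout and DDR_INUSE value come from the definitions above, while the descriptor plumbing and function name are invented. It assumes records arrive back-to-back on a file descriptor, as zfs diff reads them:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Mirrors dmu_diff_record_t and DDR_INUSE from the header above. */
typedef struct demo_diff_record {
	uint64_t ddr_type;	/* DDR_NONE, DDR_INUSE or DDR_FREE */
	uint64_t ddr_first;	/* first object number in the range */
	uint64_t ddr_last;	/* last object number in the range */
} demo_diff_record_t;

#define DDR_INUSE 0x2

/* Print the in-use object ranges arriving on fd (invented consumer). */
void
dump_inuse_ranges(int fd)
{
	demo_diff_record_t ddr;

	while (read(fd, &ddr, sizeof (ddr)) == sizeof (ddr)) {
		if (ddr.ddr_type == DDR_INUSE)
			printf("objects %llu-%llu in use\n",
			    (unsigned long long)ddr.ddr_first,
			    (unsigned long long)ddr.ddr_last);
	}
}
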
+
typedef struct zinject_record {
uint64_t zi_objset;
uint64_t zi_object;
@@ -119,6 +226,10 @@ typedef struct zinject_record {
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
} zinject_record_t;
#define ZINJECT_NULL 0x1
@@ -146,8 +257,9 @@ typedef enum zfs_case {
typedef struct zfs_cmd {
char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
char zc_string[MAXNAMELEN];
+ char zc_top_ds[MAXPATHLEN];
uint64_t zc_guid;
uint64_t zc_nvlist_conf; /* really (char *) */
uint64_t zc_nvlist_conf_size;
@@ -162,11 +274,21 @@ typedef struct zfs_cmd {
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share;
uint64_t zc_jailid;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_pad[4]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
} zfs_cmd_t;
typedef struct zfs_useracct {
@@ -176,8 +298,10 @@ typedef struct zfs_useracct {
uint64_t zu_space;
} zfs_useracct_t;
-#define ZVOL_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
+#define ZFSDEV_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
+
+#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
#ifdef _KERNEL
@@ -191,7 +315,29 @@ extern int zfs_secpolicy_rename_perms(const char *from,
const char *to, cred_t *cr);
extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
extern int zfs_busy(void);
-extern int zfs_unmount_snap(char *, void *);
+extern int zfs_unmount_snap(const char *, void *);
+
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+ ZSST_ZVOL,
+ ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+ enum zfs_soft_state_type zss_type;
+ void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+ enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
+extern kmutex_t zfsdev_state_lock;
#endif /* _KERNEL */
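
The zfs_soft_state comment above says a minor number's state is discriminated by zss_type. The sketch below shows the type-checked lookup pattern that zfsdev_get_soft_state() implies, against a stand-in registry; the table, its bound, and the demo_ names are all invented:

#include <stddef.h>

enum demo_soft_state_type { DEMO_ZVOL, DEMO_CTLDEV };

typedef struct demo_soft_state {
	enum demo_soft_state_type zss_type;
	void *zss_data;
} demo_soft_state_t;

#define DEMO_MAX_MINOR 256
static demo_soft_state_t *demo_table[DEMO_MAX_MINOR];	/* stand-in registry */

/* Return zss_data only if the minor holds the expected kind of state. */
void *
demo_get_soft_state(unsigned minornum, enum demo_soft_state_type which)
{
	demo_soft_state_t *zs;

	if (minornum >= DEMO_MAX_MINOR || (zs = demo_table[minornum]) == NULL)
		return (NULL);
	if (zs->zss_type != which)
		return (NULL);	/* e.g. a zvol minor is not a ctldev */
	return (zs->zss_data);
}
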
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
new file mode 100644
index 000000000000..4982bd4d0afc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
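
A hedged usage sketch of the API this new header declares: hold the minor behind an open /dev/zfs cleanup fd, register a callback that fires when the fd's owner exits, and keep the returned action handle so zfs_onexit_del_cb() can cancel it later. Kernel context is assumed, and release_tmp_hold()/register_cleanup() are invented names.

/*
 * Sketch (kernel context; assumes the declarations above are in scope).
 */
static void
release_tmp_hold(void *arg)
{
	/* ... undo whatever state was tied to the caller's fd ... */
}

static int
register_cleanup(int cleanup_fd, void *hold, uint64_t *handlep)
{
	minor_t minor;
	int error;

	if ((error = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0)
		return (error);
	/* Runs release_tmp_hold(hold) when the fd's owner exits. */
	error = zfs_onexit_add_cb(minor, release_tmp_hold, hold, handlep);
	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}
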
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
new file mode 100644
index 000000000000..fc40b0e9517c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to
+ * the ZPL. The numeric values of the actual
+ * attributes are not defined by the order of
+ * the enums; they are controlled by the attribute
+ * registration mechanism. Two different file
+ * systems could have different numeric values
+ * for the same attributes. This list is only
+ * used for indexing into the table that holds
+ * the actual numeric values.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_zap; /* 144 - extra attributes */
+ uint64_t zp_pad[3]; /* 152 - future */
+ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we use this space for the following:
+ * - symbolic links
+ * - 32-byte anti-virus scanstamp (regular files only)
+ */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
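
SA_MODE_OFFSET through SA_PARENT_OFFSET above give the fixed head of the SA base attribute region, so those six values sit at known offsets. A standalone sketch of reading one of them; the buffer and its contents are invented, and native endianness is assumed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SA_MODE_OFFSET	0	/* copied from zfs_sa.h above */
#define SA_SIZE_OFFSET	8

/* Read one fixed-offset attribute out of an SA base buffer. */
uint64_t
sa_base_read(const uint8_t *buf, size_t off)
{
	uint64_t v;

	memcpy(&v, buf + off, sizeof (v));
	return (v);
}

int
main(void)
{
	uint8_t bonus[48] = { 0 };	/* invented SA base region */
	uint64_t mode = 0100644;	/* regular file, mode 0644 */

	memcpy(bonus + SA_MODE_OFFSET, &mode, sizeof (mode));
	printf("mode = %llo\n",
	    (unsigned long long)sa_base_read(bonus, SA_MODE_OFFSET));
	return (0);
}
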
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
new file mode 100644
index 000000000000..a8af7ec61ba9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_STAT_H
+#define _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl. zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+ uint64_t zs_gen;
+ uint64_t zs_mode;
+ uint64_t zs_links;
+ uint64_t zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
index 163a8000248b..c328a03b3ecf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_VFSOPS_H
@@ -29,6 +28,7 @@
#include <sys/list.h>
#include <sys/vfs.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#include <sys/rrwlock.h>
#include <sys/zfs_ioctl.h>
@@ -37,6 +37,7 @@ extern "C" {
#endif
typedef struct zfsvfs zfsvfs_t;
+struct znode;
struct zfsvfs {
vfs_t *z_vfs; /* generic fs struct */
@@ -54,7 +55,6 @@ struct zfsvfs {
boolean_t z_fuid_dirty; /* need to sync fuid table ? */
struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
zilog_t *z_log; /* intent log pointer */
- uint_t z_acl_mode; /* acl chmod/mode behavior */
uint_t z_acl_inherit; /* acl inheritance behavior */
zfs_case_t z_case; /* case-sense */
boolean_t z_utf8; /* utf8-only */
@@ -71,12 +71,14 @@ struct zfsvfs {
boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
- kmutex_t z_online_recv_lock; /* held while recv in progress */
+ boolean_t z_use_sa; /* version allow system attributes */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
+ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
};
@@ -132,19 +134,23 @@ typedef struct zfid_long {
extern uint_t zfs_fsyncer_key;
extern int zfs_super_owner;
-extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valuep);
extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
- boolean_t isgroup, uint64_t fuid);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+ boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+ uint64_t fuid);
extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
-extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
+extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
+extern int zfs_vnode_lock(vnode_t *vp, int flags);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index 6f0a43636010..d3955d7eee7b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ZNODE_H
@@ -29,8 +28,11 @@
#ifdef _KERNEL
#include <sys/list.h>
#include <sys/dmu.h>
+#include <sys/sa.h>
#include <sys/zfs_vfsops.h>
#include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -54,13 +56,18 @@ extern "C" {
#define ZFS_OPAQUE 0x0000010000000000
#define ZFS_AV_QUARANTINED 0x0000020000000000
#define ZFS_AV_MODIFIED 0x0000040000000000
+#define ZFS_REPARSE 0x0000080000000000
+#define ZFS_OFFLINE 0x0000100000000000
+#define ZFS_SPARSE 0x0000200000000000
-#define ZFS_ATTR_SET(zp, attr, value) \
+#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
{ \
if (value) \
- zp->z_phys->zp_flags |= attr; \
+ pflags |= attr; \
else \
- zp->z_phys->zp_flags &= ~attr; \
+ pflags &= ~attr; \
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+ &pflags, sizeof (pflags), tx)); \
}
/*
@@ -76,25 +83,46 @@ extern "C" {
#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME]
+#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME]
+#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME]
+#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME]
+#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN]
+#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES]
+#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR]
+#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK]
+#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV]
+#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP]
+#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID]
+#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID]
+#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT]
+#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS]
+#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE]
+#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT]
+#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
+#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
+#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
+#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
+
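Each SA_ZPL_* macro above simply indexes the per-filesystem z_attr_table, yielding the numeric attribute id that the SA interfaces expect; two mounts can therefore map the same ZPL attribute to different ids. A sketch of the resulting call pattern, assuming kernel context and sa_lookup() as declared in sys/sa.h:

/*
 * Sketch: fetch the on-disk file size for a znode through its SA
 * handle (kernel context; the definitions above assumed in scope).
 */
static int
demo_get_size(znode_t *zp, uint64_t *sizep)
{
	/* SA_ZPL_SIZE(zfsvfs) == zfsvfs->z_attr_table[ZPL_SIZE] */
	return (sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
	    sizep, sizeof (*sizep)));
}
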
/*
* Is ID ephemeral?
*/
-#ifdef TODO
#define IS_EPHEMERAL(x) (x > MAXUID)
-#else
-#define IS_EPHEMERAL(x) (0)
-#endif
/*
* Should we use FUIDs?
*/
-#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\
+#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \
spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
#define MASTER_NODE_OBJ 1
/*
* Special attributes for master node.
+ * "userquota@" and "groupquota@" are also valid (from
+ * zfs_userquota_prop_prefixes[]).
*/
#define ZFS_FSID "FSID"
#define ZFS_UNLINKED_SET "DELETE_QUEUE"
@@ -102,6 +130,7 @@ extern "C" {
#define ZPL_VERSION_STR "VERSION"
#define ZFS_FUID_TABLES "FUID"
#define ZFS_SHARES_DIR "SHARES"
+#define ZFS_SA_ATTRS "SA_ATTRS"
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
@@ -132,42 +161,6 @@ extern "C" {
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
/*
- * This is the persistent portion of the znode. It is stored
- * in the "bonus buffer" of the file. Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_zap; /* 144 - extra attributes */
- uint64_t zp_pad[3]; /* 152 - future */
- zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we use this space for the following:
- * - symbolic links
- * - 32-byte anti-virus scanstamp (regular files only)
- */
-} znode_phys_t;
-
-/*
* Directory entry locks control access to directory entries.
* They are used to protect creates, deletes, and renames.
* Each directory znode has a mutex and a list of locked names.
@@ -196,20 +189,24 @@ typedef struct znode {
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_last_itx; /* last ZIL itx on this znode */
- uint64_t z_gen; /* generation (same as zp_gen) */
+ uint64_t z_gen; /* generation (cached) */
+ uint64_t z_size; /* file size (cached) */
+ uint64_t z_atime[2]; /* atime (cached) */
+ uint64_t z_links; /* file links (cached) */
+ uint64_t z_pflags; /* pflags (cached) */
+ uint64_t z_uid; /* uid fuid (cached) */
+ uint64_t z_gid; /* gid fuid (cached) */
+ mode_t z_mode; /* mode (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
list_node_t z_link_node; /* all znodes in fs link */
- /*
- * These are dmu managed fields.
- */
- znode_phys_t *z_phys; /* pointer to persistent znode */
- dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+ sa_handle_t *z_sa_hdl; /* handle to sa data */
+ boolean_t z_is_sa; /* are we native sa? */
/* FreeBSD-specific field. */
struct task z_task;
} znode_t;
@@ -277,7 +274,7 @@ VTOZ(vnode_t *vp)
#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
#define ZFS_VERIFY_ZP(zp) \
- if ((zp)->z_dbuf == NULL) { \
+ if ((zp)->z_sa_hdl == NULL) { \
ZFS_EXIT((zp)->z_zfsvfs); \
return (EIO); \
} \
@@ -319,14 +316,14 @@ VTOZ(vnode_t *vp)
#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_time_stamper(zp, ACCESSED, NULL)
+ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
extern int zfs_init_fs(zfsvfs_t *, znode_t **);
extern void zfs_set_dataprop(objset_t *);
extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
dmu_tx_t *tx);
-extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+ uint64_t [2], boolean_t);
extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
@@ -349,7 +346,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
vattr_t *vap);
extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name);
+ znode_t *dzp, char *name, uint64_t foid);
+#define ZFS_NO_OBJECT 0 /* no object id */
extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name);
extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -366,7 +364,7 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
#endif
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
index efbf65e287ee..a4c5575b2dba 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_ZIL_H
#define _SYS_ZIL_H
@@ -55,34 +56,40 @@ typedef struct zil_header {
uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
uint64_t zh_flags; /* header flags */
- uint64_t zh_pad[4];
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
} zil_header_t;
/*
* zh_flags bit settings
*/
-#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
/*
- * Log block trailer - structure at the end of the header and each log block
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block, which allows writes of only the data being used.
+ * The older position is supported for backwards compatibility.
*
- * The zit_bt contains a zbt_cksum which for the intent log is
+ * The zio_eck_t contains a zec_cksum which for the intent log is
* the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
+ * The zec_cksum is checked by the SPA against the sequence
* number passed in the blk_cksum field of the blkptr_t
*/
-typedef struct zil_trailer {
- uint64_t zit_pad;
- blkptr_t zit_next_blk; /* next block in chain */
- uint64_t zit_nused; /* bytes in log block used */
- zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
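
With the chain header at the front, a reader takes the header first and then consumes only zc_nused bytes of records before hopping to the next block. A userland model of that walk; the real zil_chain_t carries a blkptr_t and an embedded checksum, whereas the index-based linkage and all names here are invented:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in chain header: bytes used plus the next block's index. */
typedef struct demo_chain {
	int		zc_next;	/* stand-in for zc_next_blk */
	uint64_t	zc_nused;	/* bytes of records that follow */
} demo_chain_t;

#define BLKSZ 64
static char blocks[3][BLKSZ];

static void
put_block(int idx, int next, const char *records)
{
	demo_chain_t zc = { next, strlen(records) };

	memcpy(blocks[idx], &zc, sizeof (zc));
	memcpy(blocks[idx] + sizeof (zc), records, zc.zc_nused);
}

int
main(void)
{
	demo_chain_t zc;

	put_block(0, 1, "rec-a");
	put_block(1, -1, "rec-b");	/* -1 terminates the chain */

	/* Walk the chain: header first, then only zc_nused record bytes. */
	for (int i = 0; i >= 0; i = zc.zc_next) {
		memcpy(&zc, blocks[i], sizeof (zc));
		printf("%.*s\n", (int)zc.zc_nused, blocks[i] + sizeof (zc));
	}
	return (0);
}
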
#define ZIL_MIN_BLKSZ 4096ULL
#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
-#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
/*
* The words of a log block checksum.
@@ -150,16 +157,26 @@ typedef enum zil_create {
#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
* Each log record has a common structure at the beginning.
*
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number. The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record. The two sequence numbers are
- * different because the transactions can now be pushed out of order.
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
*/
typedef struct { /* common log record header */
uint64_t lrc_txtype; /* intent log transaction type */
@@ -169,6 +186,14 @@ typedef struct { /* common log record header */
} lr_t;
/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
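
Because every TX_OOO() type keeps lr_foid immediately after the common header, replay code can recover the object id with a single cast instead of a per-type switch. A sketch assuming the definitions above are in scope (ZFS_NO_OBJECT comes from zfs_znode.h in this same patch):

/*
 * Sketch: recover the object id from any out-of-order log record,
 * relying only on the layout guarantee documented above.
 */
static uint64_t
demo_lr_foid(const lr_t *lrc)
{
	if (!TX_OOO(lrc->lrc_txtype))
		return (ZFS_NO_OBJECT);	/* not an out-of-order type */
	return (((const lr_ooo_t *)lrc)->lr_foid);
}
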
+
+/*
* Handle option extended vattr attributes.
*
* Whenever new attributes are added the version number
@@ -258,7 +283,7 @@ typedef struct {
uint64_t lr_foid; /* file object to write */
uint64_t lr_offset; /* offset to write to */
uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ uint64_t lr_blkoff; /* no longer used */
blkptr_t lr_blkptr; /* spa block pointer for replay */
/* write data will follow for small writes */
} lr_write_t;
@@ -306,13 +331,34 @@ typedef struct {
*/
/*
- * ZFS intent log transaction structure
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ *    transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
*/
typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
/* and put blkptr in log, rather than actual data) */
WR_COPIED, /* immediate - data is copied into lr_write_t */
WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
} itx_wr_state_t;
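
A compact restatement of the policy in the comment above as code; the threshold parameter and the slog test are simplified stand-ins, not the actual zfs_log_write() logic:

#include <stdint.h>

typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } demo_wr_state_t;

/* Stand-in for the write-state choice described above (assumptions only). */
demo_wr_state_t
demo_choose_wr_state(uint64_t len, uint64_t blocksize, int sync,
    int have_slog, uint64_t immediate_write_sz)
{
	if (len > immediate_write_sz && !have_slog && len <= blocksize)
		return (WR_INDIRECT);	/* dmu_sync() data; log only the blkptr */
	if (sync)
		return (WR_COPIED);	/* copy the data into the log record now */
	return (WR_NEED_COPY);		/* fetch via the DMU only if flushed */
}
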
typedef struct itx {
@@ -321,30 +367,19 @@ typedef struct itx {
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
uint64_t itx_sod; /* record size on disk */
+ uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
- zilog_t *zgd_zilog; /* zilog */
- blkptr_t *zgd_bp; /* block pointer */
- struct rl *zgd_rl; /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t();
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
-extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
extern void zil_init(void);
@@ -358,28 +393,33 @@ extern void zil_close(zilog_t *zilog);
extern void zil_replay(objset_t *os, void *arg,
zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void zil_itx_destroy(itx_t *itx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
-extern int zil_vdev_offline(char *osname, void *txarg);
-extern int zil_claim(char *osname, void *txarg);
-extern int zil_check_log_chain(char *osname, void *txarg);
+extern int zil_vdev_offline(const char *osname, void *txarg);
+extern int zil_claim(const char *osname, void *txarg);
+extern int zil_check_log_chain(const char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog);
-extern int zil_is_committed(zilog_t *zilog);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
-extern void zil_add_block(zilog_t *zilog, blkptr_t *bp);
-extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
+extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
-extern int zil_disable;
+extern int zil_replay_disable;
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
index 3f2582931d15..1d4c0cc6c1de 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_ZIL_IMPL_H
#define _SYS_ZIL_IMPL_H
@@ -43,12 +44,34 @@ typedef struct lwb {
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
zio_t *lwb_zio; /* zio for this buffer */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
uint64_t lwb_max_txg; /* highest txg in this lwb */
- txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
} lwb_t;
/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ uint64_t itxg_sod; /* total size on disk for this txg */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
* we've touched so we know which ones need a write cache flush at the end.
*/
@@ -57,6 +80,8 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
+#define ZIL_PREV_BLKS 16
+
/*
* Stable storage intent log management structure. One per dataset.
*/
@@ -68,9 +93,8 @@ struct zilog {
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next itx sequence number */
- uint64_t zl_commit_seq; /* committed upto this number */
- uint64_t zl_lr_seq; /* log record sequence number */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
uint64_t zl_replaying_seq; /* current replay seq number */
@@ -82,24 +106,39 @@ struct zilog {
uint8_t zl_replay; /* replaying records while set */
uint8_t zl_stop_sync; /* for debugging */
uint8_t zl_writer; /* boolean: write setup in progress */
- uint8_t zl_log_error; /* boolean: log write error */
- list_t zl_itx_list; /* in-memory itx list */
+ uint8_t zl_logbias; /* latency or throughput */
+ uint8_t zl_sync; /* synchronous or asynchronous */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
+ uint64_t zl_next_batch; /* next batch number */
+ uint64_t zl_com_batch; /* committed batch number */
+ kcondvar_t zl_cv_batch[2]; /* batch condition variables */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
- uint64_t zl_prev_used; /* previous commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
- avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
};
-typedef struct zil_dva_node {
+typedef struct zil_bp_node {
dva_t zn_dva;
avl_node_t zn_node;
-} zil_dva_node_t;
+} zil_bp_node_t;
+
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+ sizeof (lr_write_t))
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 049c12202282..355f560f0fc7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZIO_H
@@ -38,12 +37,15 @@
extern "C" {
#endif
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
/*
* Gang block headers are self-checksumming and contain an array
@@ -51,16 +53,16 @@ typedef struct zio_block_tail {
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
+ sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
+ zio_eck_t zg_tail;
} zio_gbh_phys_t;
enum zio_checksum {
@@ -73,12 +75,19 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_FUNCTIONS
};
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+#define ZIO_CHECKSUM_MASK 0xffULL
+#define ZIO_CHECKSUM_VERIFY (1 << 8)
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
+#define ZIO_DEDUPDITTO_MIN 100
+
enum zio_compress {
ZIO_COMPRESS_INHERIT = 0,
ZIO_COMPRESS_ON,
@@ -94,12 +103,19 @@ enum zio_compress {
ZIO_COMPRESS_GZIP_7,
ZIO_COMPRESS_GZIP_8,
ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
ZIO_COMPRESS_FUNCTIONS
};
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+#define BOOTFS_COMPRESS_VALID(compress) \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ ((compress) == ZIO_COMPRESS_ON && \
+ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
+ (compress) == ZIO_COMPRESS_OFF)
+
#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
@@ -115,73 +131,81 @@ enum zio_compress {
#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
-#define ZIO_PRIORITY_TABLE_SIZE 11
-
-#define ZIO_FLAG_MUSTSUCCEED 0x000000
-#define ZIO_FLAG_CANFAIL 0x000001
-#define ZIO_FLAG_SPECULATIVE 0x000002
-#define ZIO_FLAG_CONFIG_WRITER 0x000004
-#define ZIO_FLAG_DONT_RETRY 0x000008
-
-#define ZIO_FLAG_DONT_CACHE 0x000010
-#define ZIO_FLAG_DONT_QUEUE 0x000020
-#define ZIO_FLAG_DONT_AGGREGATE 0x000040
-#define ZIO_FLAG_DONT_PROPAGATE 0x000080
-
-#define ZIO_FLAG_IO_BYPASS 0x000100
-#define ZIO_FLAG_IO_REPAIR 0x000200
-#define ZIO_FLAG_IO_RETRY 0x000400
-#define ZIO_FLAG_IO_REWRITE 0x000800
-
-#define ZIO_FLAG_SELF_HEAL 0x001000
-#define ZIO_FLAG_RESILVER 0x002000
-#define ZIO_FLAG_SCRUB 0x004000
-#define ZIO_FLAG_SCRUB_THREAD 0x008000
-
-#define ZIO_FLAG_PROBE 0x010000
-#define ZIO_FLAG_GANG_CHILD 0x020000
-#define ZIO_FLAG_RAW 0x040000
-#define ZIO_FLAG_GODFATHER 0x080000
-
-#define ZIO_FLAG_TRYHARD 0x100000
-
-#define ZIO_FLAG_GANG_INHERIT \
- (ZIO_FLAG_CANFAIL | \
- ZIO_FLAG_SPECULATIVE | \
- ZIO_FLAG_CONFIG_WRITER | \
- ZIO_FLAG_DONT_RETRY | \
- ZIO_FLAG_DONT_CACHE | \
- ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
-#define ZIO_FLAG_VDEV_INHERIT \
- (ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_IO_RETRY | \
- ZIO_FLAG_PROBE | \
- ZIO_FLAG_TRYHARD)
-
-#define ZIO_FLAG_AGG_INHERIT \
- (ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
+#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
+#define ZIO_PRIORITY_TABLE_SIZE 12
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+enum zio_flag {
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 7,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 8,
+ ZIO_FLAG_DONT_RETRY = 1 << 9,
+ ZIO_FLAG_DONT_CACHE = 1 << 10,
+ ZIO_FLAG_NODATA = 1 << 11,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 14,
+ ZIO_FLAG_TRYHARD = 1 << 15,
+ ZIO_FLAG_OPTIONAL = 1 << 16,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
+ ZIO_FLAG_IO_BYPASS = 1 << 19,
+ ZIO_FLAG_IO_REWRITE = 1 << 20,
+ ZIO_FLAG_RAW = 1 << 21,
+ ZIO_FLAG_GANG_CHILD = 1 << 22,
+ ZIO_FLAG_DDT_CHILD = 1 << 23,
+ ZIO_FLAG_GODFATHER = 1 << 24
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
#define ZIO_GANG_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_CANFAIL)
+
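[Editorial aside: the INHERIT masks above work because every flag is a distinct power of two declared in inheritance order, so `FLAG - 1` is exactly the set of all flags declared before FLAG — hence the "must be first for INHERIT" comments. A self-contained toy version of the idiom:]

    #include <stdio.h>

    /* Toy re-creation of the mask trick; names echo the patch, values are illustrative. */
    enum flag {
            F_A = 1 << 0,
            F_B = 1 << 1,
            F_C = 1 << 2,           /* first flag NOT inherited by aggregation */
    #define AGG_INHERIT (F_C - 1)   /* == F_A | F_B */
            F_D = 1 << 3
    };

    int
    main(void)
    {
            /* A child keeps only the inheritable subset of its parent's flags. */
            unsigned parent = F_A | F_C | F_D;
            unsigned child = parent & AGG_INHERIT;

            printf("child flags: %#x\n", child);    /* prints 0x1: F_A only */
            return (0);
    }
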
enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
ZIO_CHILD_LOGICAL,
ZIO_CHILD_TYPES
};
@@ -193,13 +217,13 @@ enum zio_wait_type {
};
/*
- * We'll take the EILSEQ and ENOMSG to indicate checksum errors and
- * fragmentation.
+ * We'll take the numbers 122 and 123 to indicate checksum errors and
+ * fragmentation. They don't collide with any errno values, as both
+ * are greater than ELAST.
*/
-#define ECKSUM EILSEQ
-#define EFRAGS ENOMSG
+#define ECKSUM 122
+#define EFRAGS 123
-typedef struct zio zio_t;
typedef void zio_done_func_t(zio_t *zio);
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
@@ -208,18 +232,15 @@ extern char *zio_type_name[ZIO_TYPES];
/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
* identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number. In summary:
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
*
- * mos is objset 0
- * meta-dnode is object 0
- * root block is <objset, 0, -1, 0>
- * intent log is <objset, 0, -1, ZIL sequence number>
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
*
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse. The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
*
* Note: this structure is passed between userland and the kernel.
* Therefore it must not change size or alignment between 32/64 bit
@@ -232,14 +253,66 @@ typedef struct zbookmark {
uint64_t zb_blkid;
} zbookmark_t;
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define ZB_DESTROYED_OBJSET (-1ULL)
+
+#define ZB_ROOT_OBJECT (0ULL)
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL)
+#define ZB_ZIL_LEVEL (-2LL)
+
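[Editorial sketch of how the new level constants compose with SET_BOOKMARK; the wrapper functions here are invented for illustration and are not part of the patch:]

    /* Hypothetical helper: bookmark the root block of an objset. */
    static void
    example_root_bookmark(uint64_t objset, zbookmark_t *zb)
    {
            /* Expands to four plain assignments; no function-call overhead. */
            SET_BOOKMARK(zb, objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    }

    /* ZIL blocks now live at level -2, keyed by sequence number. */
    static void
    example_zil_bookmark(uint64_t objset, uint64_t seq, zbookmark_t *zb)
    {
            SET_BOOKMARK(zb, objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, seq);
    }
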
typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
dmu_object_type_t zp_type;
uint8_t zp_level;
- uint8_t zp_ndvas;
+ uint8_t zp_copies;
+ uint8_t zp_dedup;
+ uint8_t zp_dedup_verify;
} zio_prop_t;
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+ const void *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum; /* defined in zio_checksum.h */
+
+struct zio_cksum_report {
+ struct zio_cksum_report *zcr_next;
+ nvlist_t *zcr_ereport;
+ nvlist_t *zcr_detector;
+ void *zcr_cbdata;
+ size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
+ uint64_t zcr_length;
+ zio_cksum_finish_f *zcr_finish;
+ zio_cksum_free_f *zcr_free;
+
+ /* internal use only */
+ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+ void *arg);
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+ zio_done_func_t *vsd_free;
+ zio_vsd_cksum_report_f *vsd_cksum_report;
+} zio_vsd_ops_t;
+
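[Editorial sketch: a vdev that attaches per-I/O private data now publishes an ops vector instead of the old single io_vsd_free pointer, which is what lets the checksum-report path ask the vdev for good/bad data. The example_* names are hypothetical:]

    struct example_map { int em_dummy; };   /* hypothetical per-I/O state */

    static void
    example_vsd_free(zio_t *zio)
    {
            kmem_free(zio->io_vsd, sizeof (struct example_map));
    }

    static const zio_vsd_ops_t example_vsd_ops = {
            example_vsd_free,
            zio_vsd_default_cksum_report,   /* declared just above */
    };

    /* At issue time: zio->io_vsd = map; zio->io_vsd_ops = &example_vsd_ops; */
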
typedef struct zio_gang_node {
zio_gbh_phys_t *gn_gbh;
struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
@@ -290,6 +363,7 @@ struct zio {
uint64_t io_txg;
spa_t *io_spa;
blkptr_t *io_bp;
+ blkptr_t *io_bp_override;
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
@@ -301,16 +375,20 @@ struct zio {
zio_done_func_t *io_ready;
zio_done_func_t *io_done;
void *io_private;
+ int64_t io_prev_space_delta; /* DMU private */
blkptr_t io_bp_orig;
/* Data represented by this I/O */
void *io_data;
+ void *io_orig_data;
uint64_t io_size;
+ uint64_t io_orig_size;
/* Stuff for the vdev stack */
vdev_t *io_vd;
void *io_vsd;
- zio_done_func_t *io_vsd_free;
+ const zio_vsd_ops_t *io_vsd_ops;
+
uint64_t io_offset;
uint64_t io_deadline;
avl_node_t io_offset_node;
@@ -318,15 +396,17 @@ struct zio {
avl_tree_t *io_vdev_tree;
/* Internal pipeline state */
- int io_flags;
- zio_stage_t io_stage;
- uint32_t io_pipeline;
- int io_orig_flags;
- zio_stage_t io_orig_stage;
- uint32_t io_orig_pipeline;
+ enum zio_flag io_flags;
+ enum zio_stage io_stage;
+ enum zio_stage io_pipeline;
+ enum zio_flag io_orig_flags;
+ enum zio_stage io_orig_stage;
+ enum zio_stage io_orig_pipeline;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+ uint64_t io_child_count;
+ uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@@ -336,6 +416,7 @@ struct zio {
kcondvar_t io_cv;
/* FMA state */
+ zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
#ifdef _KERNEL
@@ -346,47 +427,53 @@ struct zio {
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb);
+ int priority, enum zio_flag flags, zbookmark_t *zb);
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t txg);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
@@ -407,11 +494,11 @@ extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
@@ -420,8 +507,12 @@ extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+ enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+ enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+ enum zio_compress parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
@@ -443,9 +534,30 @@ extern int zio_inject_fault(char *name, int flags, int *id,
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+ uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
index da407399da06..0956c04ab1b4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
@@ -43,28 +42,31 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
typedef struct zio_checksum_info {
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
+ int ci_eck; /* uses zio embedded checksum? */
+ int ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
+typedef struct zio_bad_cksum {
+ zio_cksum_t zbc_expected;
+ zio_cksum_t zbc_actual;
+ const char *zbc_checksum_name;
+ uint8_t zbc_byteswapped;
+ uint8_t zbc_injected;
+ uint8_t zbc_has_cksum; /* expected/actual valid */
+} zio_bad_cksum_t;
+
extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
extern zio_checksum_t zio_checksum_SHA256;
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
-extern int zio_checksum_error(zio_t *zio);
+extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
index 66ee8d45b3b6..30bed1a676e3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -20,15 +20,13 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zio.h>
#ifdef __cplusplus
@@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
/*
* Compress and decompress data if necessary.
*/
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
- void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize);
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
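[Editorial sketch of the new calling convention: the caller now supplies the destination buffer and the return value is the compressed length. By the convention of this code base, a return equal to s_len signals that compression did not pay off — an assumption worth verifying against zio_compress.c:]

    static boolean_t
    try_compress(void *src, void *dst, size_t s_len, size_t *c_lenp)
    {
            /* dst must be caller-allocated and at least s_len bytes. */
            *c_lenp = zio_compress_data(ZIO_COMPRESS_LZJB, src, dst, s_len);

            return (*c_lenp < s_len);       /* B_TRUE: keep the compressed copy */
    }
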
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
index e7503b733cc0..d90bd8bd5921 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,104 +34,136 @@ extern "C" {
#endif
/*
- * I/O Groups: pipeline stage definitions.
+ * zio pipeline stage definitions
*/
-typedef enum zio_stage {
- ZIO_STAGE_OPEN = 0, /* RWFCI */
+enum zio_stage {
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
- ZIO_STAGE_ISSUE_ASYNC, /* -W--- */
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
- ZIO_STAGE_READ_BP_INIT, /* R---- */
- ZIO_STAGE_WRITE_BP_INIT, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+ ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
- ZIO_STAGE_DVA_FREE, /* --F-- */
- ZIO_STAGE_DVA_CLAIM, /* ---C- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
- ZIO_STAGE_READY, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 15, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
- ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
- ZIO_STAGE_DONE, /* RWFCI */
- ZIO_STAGES
-} zio_stage_t;
+ ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
+};
-#define ZIO_INTERLOCK_STAGES \
- ((1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_INTERLOCK_STAGES \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
-#define ZIO_INTERLOCK_PIPELINE \
+#define ZIO_INTERLOCK_PIPELINE \
ZIO_INTERLOCK_STAGES
-#define ZIO_VDEV_IO_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE)
-#define ZIO_READ_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY))
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
-#define ZIO_READ_PHYS_PIPELINE \
+#define ZIO_READ_PHYS_PIPELINE \
ZIO_READ_COMMON_STAGES
-#define ZIO_READ_PIPELINE \
- (ZIO_READ_COMMON_STAGES | \
- (1U << ZIO_STAGE_READ_BP_INIT))
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
-#define ZIO_WRITE_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_ISSUE_ASYNC) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE))
-
-#define ZIO_WRITE_PHYS_PIPELINE \
- ZIO_WRITE_COMMON_STAGES
-
-#define ZIO_REWRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT))
-
-#define ZIO_WRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE))
-
-#define ZIO_GANG_STAGES \
- ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \
- (1U << ZIO_STAGE_GANG_ISSUE))
+#define ZIO_DDT_CHILD_READ_PIPELINE \
+ ZIO_READ_COMMON_STAGES
-#define ZIO_FREE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_FREE))
+#define ZIO_DDT_READ_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
-#define ZIO_CLAIM_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
-#define ZIO_IOCTL_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ZIO_WRITE_COMMON_STAGES
-#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE) | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_REWRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT)
+
+#define ZIO_WRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_CHILD_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_CHECKSUM_GENERATE | \
+ ZIO_STAGE_DDT_WRITE)
+
+#define ZIO_GANG_STAGES \
+ (ZIO_STAGE_GANG_ASSEMBLE | \
+ ZIO_STAGE_GANG_ISSUE)
+
+#define ZIO_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_DVA_FREE)
+
+#define ZIO_DDT_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_DDT_FREE)
+
+#define ZIO_CLAIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_DVA_CLAIM)
+
+#define ZIO_IOCTL_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_BLOCKING_STAGES \
+ (ZIO_STAGE_DVA_ALLOCATE | \
+ ZIO_STAGE_DVA_CLAIM | \
+ ZIO_STAGE_VDEV_IO_START)
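[Editorial note: with stages now one-hot bit values, a pipeline is simply the OR of its stages and membership tests drop the old `1U << stage` shift. A sketch:]

    /* Old encoding: int hit = (pipeline & (1U << io_stage)) != 0; */

    /* New encoding: the stage IS the bit, so the test is a direct AND. */
    static int
    stage_in_pipeline(enum zio_stage stage, enum zio_stage pipeline)
    {
            return ((pipeline & stage) != 0);
    }

    /* e.g. stage_in_pipeline(ZIO_STAGE_DVA_ALLOCATE, ZIO_WRITE_PIPELINE) -> 1 */
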
extern void zio_inject_init(void);
extern void zio_inject_fini(void);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
new file mode 100644
index 000000000000..dcd63f7b5b91
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount;
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner;
+ const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define zrl_add(_z) zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
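[Editorial usage sketch for the new zrlock API above: a reference-counted lock tuned for frequent, cheap add/remove with rare exclusive entry. The surrounding functions are invented for illustration:]

    static zrlock_t zrl;    /* assume zrl_init(&zrl) ran at setup */

    /* Reader side: cheap, may be held across long operations. */
    static void
    reader(void)
    {
            zrl_add(&zrl);
            /* ... use the protected object ... */
            zrl_remove(&zrl);
    }

    /* Exclusive side: succeeds only while no references are held. */
    static int
    try_exclusive(void)
    {
            if (!zrl_tryenter(&zrl))
                    return (0);     /* readers present; caller backs off */
            /* ... the object may be swapped out safely here ... */
            zrl_exit(&zrl);
            return (1);
    }
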
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
index 2a6452aa433c..c0a0a69f71ca 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZVOL_H
#define _SYS_ZVOL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -43,26 +40,41 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
extern int zvol_check_volblocksize(uint64_t volblocksize);
extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, major_t);
+extern int zvol_create_minor(const char *);
extern int zvol_remove_minor(const char *);
+extern void zvol_remove_minors(const char *);
extern int zvol_set_volsize(const char *, major_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
+#ifdef sun
extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
-#ifndef __FreeBSD__
extern int zvol_strategy(buf_t *bp);
extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif
+#endif /* sun */
extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
int *rvalp);
extern int zvol_busy(void);
extern void zvol_init(void);
extern void zvol_fini(void);
+
+#ifdef sun
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+ ssize_t resid, boolean_t sync);
+#endif /* sun */
+
+#ifdef __FreeBSD__
+extern int zvol_create_minors(const char *name);
+#endif
+
#endif
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
index c69c117500f0..0885f27116d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -19,14 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
#include <sys/callb.h>
/*
@@ -36,24 +37,13 @@
static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);
-int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */
-extern int zfs_txg_synctime;
-extern uint64_t zfs_write_limit_override;
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0,
- "ZFS transaction groups (TXG)");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
TUNABLE_INT("vfs.zfs.txg.timeout", &zfs_txg_timeout);
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RDTUN, &zfs_txg_timeout, 0,
"Maximum seconds worth of delta per txg");
-TUNABLE_INT("vfs.zfs.txg.synctime", &zfs_txg_synctime);
-SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime, CTLFLAG_RDTUN, &zfs_txg_synctime,
- 0, "Target seconds to sync a txg");
-TUNABLE_QUAD("vfs.zfs.txg.write_limit_override", &zfs_write_limit_override);
-SYSCTL_UQUAD(_vfs_zfs_txg, OID_AUTO, write_limit_override, CTLFLAG_RW,
- &zfs_write_limit_override, 0,
- "Override maximum size of a txg to this size in bytes, "
- "value of 0 means don't override");
/*
* Prepare the txg subsystem.
@@ -74,10 +64,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
for (i = 0; i < TXG_SIZE; i++) {
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
NULL);
+ list_create(&tx->tx_cpu[c].tc_callbacks[i],
+ sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
}
}
- rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
@@ -100,7 +92,6 @@ txg_fini(dsl_pool_t *dp)
ASSERT(tx->tx_threads == 0);
- rw_destroy(&tx->tx_suspend);
mutex_destroy(&tx->tx_sync_lock);
cv_destroy(&tx->tx_sync_more_cv);
@@ -113,10 +104,15 @@ txg_fini(dsl_pool_t *dp)
int i;
mutex_destroy(&tx->tx_cpu[c].tc_lock);
- for (i = 0; i < TXG_SIZE; i++)
+ for (i = 0; i < TXG_SIZE; i++) {
cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+ }
}
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_destroy(tx->tx_commit_cb_taskq);
+
kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
bzero(tx, sizeof (tx_state_t));
@@ -196,7 +192,11 @@ txg_sync_stop(dsl_pool_t *dp)
* Finish off any work in progress.
*/
ASSERT(tx->tx_threads == 2);
- txg_wait_synced(dp, 0);
+
+ /*
+ * We need to ensure that we've vacated the deferred space_maps.
+ */
+ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
/*
* Wake all sync threads and wait for them to die.
@@ -246,6 +246,17 @@ txg_rele_to_quiesce(txg_handle_t *th)
}
void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+ mutex_exit(&tc->tc_lock);
+}
+
+void
txg_rele_to_sync(txg_handle_t *th)
{
tx_cpu_t *tc = th->th_cpu;
@@ -296,9 +307,61 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
}
static void
+txg_do_callbacks(void *arg)
+{
+ list_t *cb_list = arg;
+
+ dmu_tx_do_callbacks(cb_list, 0);
+
+ list_destroy(cb_list);
+
+ kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+ int c;
+ tx_state_t *tx = &dp->dp_tx;
+ list_t *cb_list;
+
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ /* No need to lock tx_cpu_t at this point */
+
+ int g = txg & TXG_MASK;
+
+ if (list_is_empty(&tc->tc_callbacks[g]))
+ continue;
+
+ if (tx->tx_commit_cb_taskq == NULL) {
+ /*
+ * Commit callback taskq hasn't been created yet.
+ */
+ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+ max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
+ TASKQ_PREPOPULATE);
+ }
+
+ cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(cb_list, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+
+ list_move_tail(&tc->tc_callbacks[g], cb_list);
+
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ txg_do_callbacks, cb_list, TQ_SLEEP);
+ }
+}
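[Editorial sketch of the consumer side of this machinery — registration itself lives in dmu_tx.c, outside this excerpt. Assuming the v28-era dmu_tx_callback_register() interface:]

    static void
    my_commit_cb(void *arg, int error)
    {
            /* error is 0 on commit, nonzero if the tx never made it to disk. */
            kmem_free(arg, sizeof (struct my_state));       /* hypothetical state */
    }

    /* After dmu_tx_assign() succeeds:
     *      dmu_tx_callback_register(tx, my_commit_cb, state);
     * The callback later runs on the tx_commit_cb taskq created above,
     * once its txg has synced. */
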
+
+static void
txg_sync_thread(void *arg)
{
dsl_pool_t *dp = arg;
+ spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
uint64_t start, delta;
@@ -311,20 +374,19 @@ txg_sync_thread(void *arg)
uint64_t txg;
/*
- * We sync when we're scrubbing, there's someone waiting
+ * We sync when we're scanning, there's someone waiting
* on us, or the quiesce thread has handed off a txg to
* us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
- while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
- spa_shutting_down(dp->dp_spa)) &&
+ while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
- delta = LBOLT - start;
+ delta = ddi_get_lbolt() - start;
timer = (delta > timeout ? 0 : timeout - delta);
}
@@ -342,8 +404,6 @@ txg_sync_thread(void *arg)
if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
- rw_enter(&tx->tx_suspend, RW_WRITER);
-
/*
* Consume the quiesced txg which has been handed off to
* us. This may cause the quiescing thread to now be
@@ -353,22 +413,24 @@ txg_sync_thread(void *arg)
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
- rw_exit(&tx->tx_suspend);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
- start = LBOLT;
- spa_sync(dp->dp_spa, txg);
- delta = LBOLT - start;
+ start = ddi_get_lbolt();
+ spa_sync(spa, txg);
+ delta = ddi_get_lbolt() - start;
mutex_enter(&tx->tx_sync_lock);
- rw_enter(&tx->tx_suspend, RW_WRITER);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
- rw_exit(&tx->tx_suspend);
cv_broadcast(&tx->tx_sync_done_cv);
+
+ /*
+ * Dispatch commit callbacks to worker threads.
+ */
+ txg_dispatch_callbacks(dp, txg);
}
}
@@ -426,7 +488,7 @@ void
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
{
tx_state_t *tx = &dp->dp_tx;
- int timeout = LBOLT + ticks;
+ int timeout = ddi_get_lbolt() + ticks;
/* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg ||
@@ -439,10 +501,10 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
return;
}
- while (LBOLT < timeout &&
+ while (ddi_get_lbolt() < timeout &&
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
- timeout - LBOLT);
+ timeout - ddi_get_lbolt());
mutex_exit(&tx->tx_sync_lock);
}
@@ -455,7 +517,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
mutex_enter(&tx->tx_sync_lock);
ASSERT(tx->tx_threads == 2);
if (txg == 0)
- txg = tx->tx_open_txg;
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
if (tx->tx_sync_txg_waiting < txg)
tx->tx_sync_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -506,21 +568,6 @@ txg_sync_waiting(dsl_pool_t *dp)
tx->tx_quiesced_txg != 0);
}
-void
-txg_suspend(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- /* XXX some code paths suspend when they are already suspended! */
- rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- rw_exit(&tx->tx_suspend);
-}
-
/*
* Per-txg object lists.
*/
@@ -578,6 +625,34 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
}
/*
+ * Add an entry to the end of the list (walks list to find end).
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ int already_on_list;
+
+ mutex_enter(&tl->tl_lock);
+ already_on_list = tn->tn_member[t];
+ if (!already_on_list) {
+ txg_node_t **tp;
+
+ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+ continue;
+
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = NULL;
+ *tp = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (already_on_list);
+}
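[Editorial note: txg_list_add() inserts at the head, so entries drain in reverse insertion order; txg_list_add_tail() walks to the end to preserve FIFO order at O(n) cost, which is acceptable because its callers keep these per-txg lists short. A toy illustration against the txg_list API:]

    /* Every txg_list user embeds a txg_node_t at a fixed offset. */
    typedef struct obj {
            txg_node_t o_node;
            int o_id;
    } obj_t;

    static void
    example(uint64_t txg)
    {
            static txg_list_t list;
            static obj_t a, b;

            txg_list_create(&list, offsetof(obj_t, o_node));
            (void) txg_list_add_tail(&list, &a, txg);       /* drains first */
            (void) txg_list_add_tail(&list, &b, txg);       /* drains second */
            /* txg_list_remove(&list, txg) now returns &a, then &b. */
    }
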
+
+/*
* Remove the head of the list and return it.
*/
void *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
index 34d7e0c3ac74..692cda137f1a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/uberblock_impl.h>
#include <sys/vdev_impl.h>
@@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
ub->ub_txg = txg;
ub->ub_guid_sum = rvd->vdev_guid_sum;
ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
return (ub->ub_rootbp.blk_birth == txg);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index cb43af37f010..51a3c792deda 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -40,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
+#include <sys/dsl_scan.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
@@ -61,6 +61,7 @@ static vdev_ops_t *vdev_ops_table[] = {
#endif
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -95,9 +96,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
- uint64_t c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
@@ -106,40 +106,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
}
/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
*/
uint64_t
-vdev_get_rsize(vdev_t *vd)
+vdev_get_min_asize(vdev_t *vd)
{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
+ vdev_t *pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
- pvd = vd->vdev_parent;
+ /*
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
+ */
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
/*
- * If our parent is NULL or the root, just return our own psize.
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of its asize.
*/
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return (pvd->vdev_min_asize / pvd->vdev_children);
- rsize = 0;
+ return (pvd->vdev_min_asize);
+}
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
- return (rsize);
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
}
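[Editorial worked example of the new rule, with illustrative numbers: a top-level raidz vdev's min_asize is its asize aligned down to a metaslab boundary, and each of its N children must then supply at least 1/Nth of that — so a replacement disk only has to cover its share of allocatable space, not match the old disk's physical size.]

    /* Sketch of the raidz branch above; names and numbers are illustrative. */
    static uint64_t
    raidz_child_min_asize(uint64_t parent_asize, uint64_t ms_shift, int children)
    {
            uint64_t parent_min = P2ALIGN(parent_asize, 1ULL << ms_shift);

            return (parent_min / children);
    }
    /* e.g. 120 GB of aligned asize across 4 children => each must provide >= 30 GB. */
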
vdev_t *
@@ -160,13 +167,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
- int c;
vdev_t *mvd;
if (vd->vdev_guid == guid)
return (vd);
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL)
return (mvd);
@@ -212,9 +218,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}
void
@@ -249,9 +252,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}
/*
@@ -262,17 +262,17 @@ vdev_compact_children(vdev_t *pvd)
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
- int newc, c;
+ int newc;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- for (c = newc = 0; c < oldc; c++)
+ for (int c = newc = 0; c < oldc; c++)
if (pvd->vdev_child[c])
newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
- for (c = newc = 0; c < oldc; c++) {
+ for (int c = newc = 0; c < oldc; c++) {
if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd;
cvd->vdev_id = newc++;
@@ -287,7 +287,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -299,21 +299,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(NULL);
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(spa);
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
@@ -324,6 +321,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -384,6 +382,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
}
/*
@@ -400,6 +401,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -407,23 +411,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
- /*
- * Currently, we can only support 2 parity devices.
- */
- if (nparity == 0 || nparity > 2)
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (EINVAL);
/*
- * Older versions can only support 1 parity device.
+ * Previous versions could only support 1 or 2 parity
+ * devices.
*/
- if (nparity == 2 &&
- spa_version(spa) < SPA_VERSION_RAID6)
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (ENOTSUP);
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
return (ENOTSUP);
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
- if (spa_version(spa) >= SPA_VERSION_RAID6)
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (EINVAL);
/*
* Otherwise, we default to 1 parity device for RAID-Z.
@@ -471,43 +476,86 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ if (parent && !parent->vdev_parent &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ }
+
+ if (parent && !parent->vdev_parent) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ vd->vdev_mg = metaslab_group_create(islog ?
+ spa_log_class(spa) : spa_normal_class(spa), vd);
}
/*
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ &vd->vdev_resilvering);
+
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
- * valid in the current context.
+ * valid in the current context. Local vdevs will
+ * remain in the faulted state.
*/
- if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ if (spa_load_state(spa) == SPA_LOAD_OPEN) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
}
}
@@ -524,7 +572,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
void
vdev_free(vdev_t *vd)
{
- int c;
spa_t *spa = vd->vdev_spa;
/*
@@ -534,11 +581,12 @@ vdev_free(vdev_t *vd)
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
*/
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL);
@@ -547,8 +595,10 @@ vdev_free(vdev_t *vd)
/*
* Discard allocation state.
*/
- if (vd == vd->vdev_top)
+ if (vd->vdev_mg != NULL) {
vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ }
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -668,14 +718,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
- int c;
-
if (vd == NULL)
return;
vd->vdev_top = tvd;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_top_update(tvd, vd->vdev_child[c]);
}
@@ -694,8 +742,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -737,6 +787,7 @@ vdev_remove_parent(vdev_t *cvd)
*/
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
@@ -756,16 +807,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -777,14 +834,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
ASSERT(oldc <= newc);
- if (vd->vdev_islog)
- mc = spa->spa_log_class;
- else
- mc = spa->spa_normal_class;
-
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
-
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (oldc != 0) {
@@ -819,6 +868,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
}
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (oldc == 0 && !vd->vdev_removing)
+ metaslab_group_activate(vd->vdev_mg);
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
return (0);
}
@@ -829,6 +892,7 @@ vdev_metaslab_fini(vdev_t *vd)
uint64_t count = vd->vdev_ms_count;
if (vd->vdev_ms != NULL) {
+ metaslab_group_passivate(vd->vdev_mg);
for (m = 0; m < count; m++)
if (vd->vdev_ms[m] != NULL)
metaslab_fini(vd->vdev_ms[m]);
@@ -956,6 +1020,10 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
@@ -988,6 +1056,55 @@ vdev_probe(vdev_t *vd, zio_t *zio)
return (NULL);
}
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+ taskq_t *tq;
+ int children = vd->vdev_children;
+
+ /*
+ * In order to handle pools on top of zvols, do the opens
+ * in a single thread so that the same thread holds the
+ * spa_namespace_lock (the B_TRUE test below forces this
+ * serial path unconditionally).
+ */
+ if (B_TRUE || vdev_uses_zvols(vd)) {
+ for (int c = 0; c < children; c++)
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ return;
+ }
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+ TQ_SLEEP) != 0);
+
+ taskq_destroy(tq);
+}
+
/*
* Prepare a virtual device for access.
*/
@@ -996,13 +1113,12 @@ vdev_open(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int error;
- int c;
uint64_t osize = 0;
uint64_t asize, psize;
uint64_t ashift = 0;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1010,11 +1126,18 @@ vdev_open(vdev_t *vd)
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- VDEV_AUX_ERR_EXCEEDED);
+ vd->vdev_label_aux);
return (ENXIO);
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
@@ -1024,6 +1147,11 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, NULL, ENXIO);
@@ -1039,20 +1167,40 @@ vdev_open(vdev_t *vd)
vd->vdev_removed = B_FALSE;
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (ENXIO);
+ }
+
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
- for (c = 0; c < vd->vdev_children; c++)
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
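P2ALIGN() rounds a value down to a power-of-two boundary by masking off the low bits. A standalone sketch of the truncation above, assuming the usual 256 KB on-disk vdev_label_t (the exact size is defined in vdev_impl.h):

#include <stdio.h>
#include <stdint.h>

/* same definition as the kernel macro: mask off the low bits */
#define P2ALIGN(x, align)   ((x) & -(align))

int
main(void)
{
    uint64_t label = 256 * 1024;   /* assumed sizeof (vdev_label_t) */
    uint64_t osize = 1000000;      /* an odd-sized device */

    /* 1000000 & ~(262144 - 1) == 786432: three whole label units */
    printf("%llu\n", (unsigned long long)P2ALIGN(osize, label));
    return (0);
}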
@@ -1077,6 +1225,15 @@ vdev_open(vdev_t *vd)
vd->vdev_psize = psize;
+ /*
+ * Make sure the allocatable size hasn't shrunk.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
@@ -1093,25 +1250,18 @@ vdev_open(vdev_t *vd)
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
+ }
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth. If automatic
+ * expansion is enabled then use the additional space.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand))
+ vd->vdev_asize = asize;
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
+ vdev_set_min_asize(vd);
/*
* Ensure we can issue some IO before declaring the
@@ -1119,8 +1269,8 @@ vdev_open(vdev_t *vd)
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_IO_FAILURE);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
return (error);
}
@@ -1150,12 +1300,11 @@ int
vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- int c;
nvlist_t *label;
- uint64_t guid, top_guid;
+ uint64_t guid = 0, top_guid;
uint64_t state;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF);
@@ -1165,6 +1314,8 @@ vdev_validate(vdev_t *vd)
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ uint64_t aux_guid = 0;
+ nvlist_t *nvl;
if ((label = vdev_label_read_config(vd)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1172,6 +1323,18 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ return (0);
+ }
+
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1180,6 +1343,11 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
@@ -1187,12 +1355,16 @@ vdev_validate(vdev_t *vd)
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
- (vd->vdev_guid != guid &&
+ ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -1211,11 +1383,11 @@ vdev_validate(vdev_t *vd)
nvlist_free(label);
/*
- * If spa->spa_load_verbatim is true, no need to check the
+ * If this is a verbatim import, no need to check the
* state of the pool.
*/
- if (!spa->spa_load_verbatim &&
- spa->spa_load_state == SPA_LOAD_OPEN &&
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
@@ -1238,15 +1410,23 @@ void
vdev_close(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ vdev_t *pvd = vd->vdev_parent;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
/*
- * We record the previous state before we close it, so that if we are
+ * We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
@@ -1260,12 +1440,49 @@ vdev_close(vdev_t *vd)
}
void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
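vdev_hold() and vdev_rele() are matched recursive walks that only touch leaves. A toy standalone model of that paired shape (the tree here is a stand-in for vdev_t, not the kernel type):

#include <stdio.h>

struct node {
    int holds;
    int nchildren;
    struct node *child[2];
};

static void
node_hold(struct node *n)
{
    for (int c = 0; c < n->nchildren; c++)
        node_hold(n->child[c]);
    if (n->nchildren == 0)   /* leaves only, as in vdev_hold() */
        n->holds++;
}

static void
node_rele(struct node *n)
{
    for (int c = 0; c < n->nchildren; c++)
        node_rele(n->child[c]);
    if (n->nchildren == 0)
        n->holds--;
}

int
main(void)
{
    struct node l0 = {0, 0}, l1 = {0, 0};
    struct node root = {0, 2, {&l0, &l1}};

    node_hold(&root);
    node_rele(&root);
    printf("%d %d\n", l0.holds, l1.holds);   /* back to 0 0 */
    return (0);
}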
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened, as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
vdev_close(vd);
(void) vdev_open(vd);
@@ -1278,12 +1495,8 @@ vdev_reopen(vdev_t *vd)
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd)) {
- uint64_t size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ !l2arc_vdev_present(vd))
+ l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd);
}
@@ -1323,33 +1536,23 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
return (0);
}
-/*
- * The is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
void
-vdev_init(vdev_t *vd, uint64_t txg)
+vdev_metaslab_set_size(vdev_t *vd)
{
/*
* Aim for roughly 200 metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
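The sizing rule above picks the smallest power-of-two metaslab size that yields at most roughly 200 metaslabs, floored at SPA_MAXBLOCKSHIFT. A standalone worked example, assuming SPA_MAXBLOCKSHIFT is 17 and the illumos highbit() convention (highest set bit, counting from 1):

#include <stdio.h>
#include <stdint.h>

#define SPA_MAXBLOCKSHIFT   17   /* assumed, per spa.h in this tree */
#define MAX(a, b)           ((a) > (b) ? (a) : (b))

/* illumos highbit(): position of the highest set bit, from 1 */
static int
highbit(uint64_t v)
{
    int h = 0;

    while (v != 0) {
        h++;
        v >>= 1;
    }
    return (h);
}

int
main(void)
{
    uint64_t asize = 1ULL << 40;   /* a 1 TiB top-level vdev */
    int ms_shift = MAX(highbit(asize / 200), SPA_MAXBLOCKSHIFT);

    /* 2^40 / 200 ~= 2^32.3, so ms_shift = 33: 128 metaslabs of 8 GiB */
    printf("ms_shift %d, count %llu\n", ms_shift,
        (unsigned long long)(asize >> ms_shift));
    return (0);
}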
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
@@ -1364,7 +1567,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication. There are three kinds of DTL:
+ * the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
@@ -1405,6 +1608,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size))
@@ -1458,14 +1662,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
- (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
- /* XXX should check scrub_done? */
+ (spa->spa_scrub_started ||
+ (scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -1550,6 +1756,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1577,6 +1785,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1655,6 +1865,9 @@ vdev_dtl_required(vdev_t *vd)
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ if (!required && zio_injection_enabled)
+ required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
return (required);
}
@@ -1713,7 +1926,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1770,11 +1983,49 @@ vdev_validate_aux(vdev_t *vd)
}
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+ ASSERT(!vd->vdev_ishole);
+
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
@@ -1790,6 +2041,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1800,6 +2053,12 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -1822,11 +2081,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
* not be opened, and no I/O is attempted.
*/
int
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1834,19 +2093,28 @@ vdev_fault(spa_t *spa, uint64_t guid)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
/*
* Faulted state takes precedence over degraded.
*/
+ vd->vdev_delayed_close = B_FALSE;
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
/*
- * If marking the vdev as faulted cause the top-level vdev to become
- * unavailable, then back off and simply mark the vdev as degraded
- * instead.
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
*/
- if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -1854,12 +2122,10 @@ vdev_fault(spa_t *spa, uint64_t guid)
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
- vdev_reopen(vd);
+ vdev_reopen(tvd);
- if (vdev_readable(vd)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
- }
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
}
return (spa_vdev_state_exit(spa, vd, 0));
@@ -1871,11 +2137,11 @@ vdev_fault(spa_t *spa, uint64_t guid)
* as I/O is concerned.
*/
int
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1892,7 +2158,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
+ aux);
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -1906,9 +2172,9 @@ vdev_degrade(spa_t *spa, uint64_t guid)
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1916,13 +2182,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
- vdev_reopen(vd->vdev_top);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+ }
+
+ vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
@@ -1931,16 +2210,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *vd, *tvd;
- int error;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
- spa_vdev_state_enter(spa);
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1949,6 +2238,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
/*
* If the device isn't already offline, try to offline it.
@@ -1964,6 +2255,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_offline_log(spa);
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+ }
+
+ /*
* Offline this device and reopen its top-level vdev.
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
@@ -1978,28 +2300,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
- if (!tvd->vdev_islog || !vdev_is_dead(tvd))
- return (spa_vdev_state_exit(spa, vd, 0));
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
- (void) spa_vdev_state_exit(spa, vd, 0);
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
- NULL, DS_FIND_CHILDREN);
- if (error) {
- (void) vdev_online(spa, guid, 0, NULL);
- return (error);
- }
- /*
- * If we successfully offlined the log device then we need to
- * sync out the current txg so that the "stubby" block can be
- * removed by zil_sync().
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- return (0);
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
}
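The slog-offline path above uses a drop-the-lock-and-revalidate loop: passivate the metaslab group, do the slow work with locks dropped, then compare a generation counter and start over if the config moved. A standalone sketch of that shape (locking and the generation bump are modeled with plain variables; this shows the pattern, not the kernel code):

#include <stdio.h>
#include <stdint.h>

static uint64_t config_generation;

static int
offline_log(int pass)
{
    config_generation++;        /* our own, expected change */
    if (pass == 1)
        config_generation++;    /* simulate a concurrent change */
    return (0);
}

int
main(void)
{
    int error, tries = 0;
    uint64_t generation;

top:
    tries++;
    generation = config_generation + 1;
    /* locked checks would happen here, then locks are dropped */
    error = offline_log(tries);
    if (error == 0 && generation != config_generation)
        goto top;   /* config moved underneath us: start over */
    printf("done after %d tries, error %d\n", tries, error);
    return (error);
}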
/*
@@ -2033,13 +2357,22 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
- vd->vdev_faulted = vd->vdev_degraded = 0;
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+ * still broken, we want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
- vdev_reopen(vd);
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
- if (vd != rvd)
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
@@ -2047,12 +2380,30 @@ vdev_clear(spa_t *spa, vdev_t *vd)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
}
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2081,7 +2432,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole);
}
boolean_t
@@ -2111,10 +2462,11 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
mutex_exit(&vd->vdev_stat_lock);
/*
@@ -2131,7 +2483,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
@@ -2148,6 +2500,19 @@ vdev_clear_stats(vdev_t *vd)
}
void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
spa_t *spa = zio->io_spa;
@@ -2191,8 +2556,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += psize;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -2217,6 +2591,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
+ /*
+ * Intent log writes won't propagate their error to the root
+ * I/O, so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
@@ -2230,14 +2612,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD))) {
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
/*
- * This is either a normal write (not a repair), or it's a
- * repair induced by the scrub thread. In the normal case,
- * we commit the DTL change in the same txg as the block
- * was born. In the scrub-induced repair case, we know that
- * scrubs run in first-pass syncing context, so we commit
- * the DTL change in spa->spa_syncing_txg.
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
@@ -2246,13 +2631,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa->spa_syncing_txg;
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
}
- ASSERT(commit_txg >= spa->spa_syncing_txg);
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2264,46 +2652,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- int c;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
- boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -2319,28 +2680,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (update_root) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
- return;
-
+ if (mc == spa_normal_class(spa)) {
mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
+
+ if (mc != NULL) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
+ }
}
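vdev_space_update() takes signed deltas so that allocation and free paths share one entry point. A standalone sketch of that calling convention (defer_delta and the metaslab-class bookkeeping are omitted; deltas shown are illustrative):

#include <stdio.h>
#include <stdint.h>

static int64_t vs_alloc, vs_space;

/* signed deltas: allocations add, frees subtract */
static void
space_update(int64_t alloc_delta, int64_t space_delta)
{
    vs_alloc += alloc_delta;
    vs_space += space_delta;
}

int
main(void)
{
    space_update(0, 1 << 30);   /* vdev added: 1 GiB of raw space */
    space_update(8192, 0);      /* allocate one 8K block */
    space_update(-8192, 0);     /* ... and free it again */
    printf("alloc %lld space %lld\n",
        (long long)vs_alloc, (long long)vs_space);
    return (0);
}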
/*
@@ -2355,6 +2714,8 @@ vdev_config_dirty(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_writeable(spa));
+
/*
* If this is an aux vdev (as with l2cache and spare devices), then we
* update the vdev config manually and set the sync flag.
@@ -2392,7 +2753,7 @@ vdev_config_dirty(vdev_t *vd)
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
@@ -2413,7 +2774,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2442,6 +2804,7 @@ vdev_state_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
@@ -2454,7 +2817,7 @@ vdev_state_dirty(vdev_t *vd)
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node))
+ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
@@ -2481,13 +2844,18 @@ vdev_propagate_state(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
- int c;
vdev_t *child;
if (vd->vdev_children > 0) {
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2551,15 +2919,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
/*
* If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
+ /*
+ * If we have brought this vdev back into service, we need
+ * to notify fmd so that it can gracefully repair any outstanding
+ * cases due to a missing device. We do this in all cases, even those
+ * that probably don't correlate to a repaired fault. This is sure to
+ * catch all cases, and we let the zfs-retire agent sort it out. If
+ * this is a transient state it's OK, as the retire agent will
+ * double-check the state of the vdev before repairing it.
+ */
+ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_prevstate != state)
+ zfs_post_state_change(spa, vd);
+
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
@@ -2575,20 +2959,16 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
- /*
- * Indicate to the ZFS DE that this device has been removed, and
- * any recent errors should be ignored.
- */
- zfs_post_remove(spa, vd);
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
*/
- if (spa->spa_load_state == SPA_LOAD_IMPORT &&
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
@@ -2631,9 +3011,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
- case VDEV_AUX_IO_FAILURE:
- class = FM_EREPORT_ZFS_IO_FAILURE;
- break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
@@ -2682,7 +3059,7 @@ vdev_is_bootable(vdev_t *vd)
return (B_FALSE);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
@@ -2690,31 +3067,84 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t c, children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(nvd->vdev_top->vdev_islog);
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf) {
/*
- * It would be nice to call vdev_offline()
- * directly but the pool isn't fully loaded and
- * the txg threads have not been started yet.
+ * Restore the persistent vdev state
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ nvd->vdev_faulted = ovd->vdev_faulted;
+ nvd->vdev_degraded = ovd->vdev_degraded;
+ nvd->vdev_removed = ovd->vdev_removed;
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
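Expansion only does work when the grown asize spans more metaslabs than are already initialized. Worked standalone, assuming 512 MiB metaslabs (an ms_shift of 29):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t ms_shift = 29;          /* 512 MiB metaslabs */
    uint64_t ms_count = 200;         /* initialized so far (100 GiB) */
    uint64_t asize = 200ULL << 30;   /* LUN grew to 200 GiB */

    /* 200 GiB >> 29 == 400 > 200, so 200 new metaslabs are needed */
    if ((asize >> ms_shift) > ms_count)
        printf("init %llu new metaslabs\n",
            (unsigned long long)((asize >> ms_shift) - ms_count));
    return (0);
}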
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
}
+ vdev_propagate_state(cvd);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index 8fc3738cab37..7978d612501a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -184,7 +184,7 @@ vdev_cache_allocate(zio_t *zio)
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
- ve->ve_lastused = LBOLT;
+ ve->ve_lastused = ddi_get_lbolt();
ve->ve_data = zio_buf_alloc(VCBS);
avl_add(&vc->vc_offset_tree, ve);
@@ -201,9 +201,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL);
- if (ve->ve_lastused != LBOLT) {
+ if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = LBOLT;
+ ve->ve_lastused = ddi_get_lbolt();
avl_add(&vc->vc_lastused_tree, ve);
}
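The hit path above avoids AVL churn by only re-keying an entry when the lbolt tick actually advanced. The same guard in a standalone sketch (a plain struct stands in for the cache entry, and a variable for ddi_get_lbolt()):

#include <stdio.h>
#include <stdint.h>

static int64_t lbolt;   /* stand-in for ddi_get_lbolt() */

struct entry {
    int64_t lastused;
};

/* only pay the remove/re-insert cost when the key would change */
static void
cache_hit(struct entry *e)
{
    if (e->lastused != lbolt) {
        /* avl_remove(&lastused_tree, e) would go here */
        e->lastused = lbolt;
        /* avl_add(&lastused_tree, e) would go here */
    }
}

int
main(void)
{
    struct entry e = { .lastused = 0 };

    lbolt = 5;
    cache_hit(&e);   /* re-keyed: 0 != 5 */
    cache_hit(&e);   /* no-op: key unchanged within the tick */
    printf("%lld\n", (long long)e.lastused);
    return (0);
}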
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
index 5db7a6aec2f6..d7417736b4ee 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -19,12 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
@@ -44,12 +43,71 @@ typedef struct vdev_disk_buf {
zio_t *vdb_io;
} vdev_disk_buf_t;
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ddi_devid_t devid;
+ char *minor;
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ (void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
+ kmem_free(buf, len);
+ }
+
+ if (vd->vdev_name_vp == NULL)
+ (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
+
+ if (vd->vdev_devid != NULL &&
+ ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
+ (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
+ ddi_devid_str_free(minor);
+ ddi_devid_free(devid);
+ }
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ if (vd->vdev_name_vp) {
+ VN_RELE_ASYNC(vd->vdev_name_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_name_vp = NULL;
+ }
+ if (vd->vdev_devid_vp) {
+ VN_RELE_ASYNC(vd->vdev_devid_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_devid_vp = NULL;
+ }
+}
+
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
spa_t *spa = vd->vdev_spa;
vdev_disk_t *dvd;
- struct dk_minfo dkm;
+ struct dk_minfo_ext dkmext;
int error;
dev_t dev;
int otyp;
@@ -62,6 +120,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
return (EINVAL);
}
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ dvd = vd->vdev_tsd;
+ goto skip_open;
+ }
+
dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
/*
@@ -79,12 +147,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
*
* 3. Otherwise, the device may have moved. Try opening the device
* by the devid instead.
- *
- * If the vdev is part of the root pool, we avoid opening it by path.
- * We do this because there is no /dev path available early in boot,
- * and if we try to open the device by path at a later point, we can
- * deadlock when devfsadm attempts to open the underlying backing store
- * file.
*/
if (vd->vdev_devid != NULL) {
if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
@@ -96,7 +158,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
error = EINVAL; /* presume failure */
- if (vd->vdev_path != NULL && !spa_is_root(spa)) {
+ if (vd->vdev_path != NULL) {
ddi_devid_t devid;
if (vd->vdev_wholedisk == -1ULL) {
@@ -167,7 +229,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* as above. This hasn't been used in a very long time and we
* don't need to propagate its oddities to this edge condition.
*/
- if (error && vd->vdev_path != NULL && !spa_is_root(spa))
+ if (error && vd->vdev_path != NULL)
error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
kcred, &dvd->vd_lh, zfs_li);
}
@@ -202,6 +264,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
kmem_free(physpath, MAXPATHLEN);
}
+skip_open:
/*
* Determine the actual size of the device.
*/
@@ -224,11 +287,11 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* Determine the device's minimum transfer size.
* If the ioctl isn't supported, assume DEV_BSIZE.
*/
- if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
FKIOCTL, kcred, NULL) != 0)
- dkm.dki_lbsize = DEV_BSIZE;
+ dkmext.dki_pbsize = DEV_BSIZE;
- *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
+ *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
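+	/*
+	 * For example: a 4K-sector device reports dki_pbsize == 4096;
+	 * highbit(4096) == 13, so *ashift == 12 and all allocations are
+	 * 4 KB aligned. A legacy 512-byte device yields highbit(512) - 1
+	 * == 9.
+	 */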
/*
* Clear the nowritecache bit, so that on a vdev_reopen() we will
@@ -244,7 +307,7 @@ vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
- if (dvd == NULL)
+ if (vd->vdev_reopening || dvd == NULL)
return;
if (dvd->vd_minor != NULL)
@@ -256,6 +319,7 @@ vdev_disk_close(vdev_t *vd)
if (dvd->vd_lh != NULL)
(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+ vd->vdev_delayed_close = B_FALSE;
kmem_free(dvd, sizeof (vdev_disk_t));
vd->vdev_tsd = NULL;
}
@@ -315,6 +379,11 @@ vdev_disk_ioctl_free(zio_t *zio)
kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}
+static const zio_vsd_ops_t vdev_disk_vsd_ops = {
+ vdev_disk_ioctl_free,
+ zio_vsd_default_cksum_report
+};
+
static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
@@ -355,7 +424,7 @@ vdev_disk_io_start(zio_t *zio)
}
zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
- zio->io_vsd_free = vdev_disk_ioctl_free;
+ zio->io_vsd_ops = &vdev_disk_vsd_ops;
dkc->dkc_callback = vdev_disk_ioctl_done;
dkc->dkc_flag = FLUSH_VOLATILE;
@@ -427,14 +496,23 @@ vdev_disk_io_done(zio_t *zio)
* asynchronous removal of the device. Otherwise, probe the device and
* make sure it's still accessible.
*/
- if (zio->io_error == EIO) {
+ if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
vdev_disk_t *dvd = vd->vdev_tsd;
int state = DKIO_NONE;
if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+ /*
+ * We post the resource as soon as possible, instead of
+ * when the async removal actually happens, because the
+ * DE is using this information to discard previous I/O
+ * errors.
+ */
+ zfs_post_remove(zio->io_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
}
}
}
@@ -446,6 +524,8 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
+ vdev_disk_hold,
+ vdev_disk_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -488,6 +568,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
+ *config = NULL;
for (l = 0; l < VDEV_LABELS; l++) {
uint64_t offset, state, txg = 0;
@@ -522,6 +603,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
kmem_free(label, sizeof (vdev_label_t));
(void) ldi_close(vd_lh, FREAD, kcred);
+ if (*config == NULL)
+ error = EIDRM;
return (error);
}
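With the change above, a caller that finds no valid label on any of the four slots now gets EIDRM instead of a stale success. A standalone model of that fallback (integers stand in for label nvlists; the slot layout is illustrative):

#include <stdio.h>
#include <errno.h>

#ifndef EIDRM
#define EIDRM   82   /* assumed value where errno.h lacks it */
#endif

/* only a slot holding a valid config clears the error */
static int
read_rootlabel(const int *slots, int n, int *config)
{
    *config = 0;
    for (int l = 0; l < n; l++) {
        if (slots[l] != 0) {
            *config = slots[l];
            break;
        }
    }
    return (*config == 0 ? EIDRM : 0);
}

int
main(void)
{
    int good[4] = { 0, 7, 0, 0 }, bad[4] = { 0 }, config;

    printf("%d ", read_rootlabel(good, 4, &config));   /* 0 */
    printf("%d\n", read_rootlabel(bad, 4, &config));   /* EIDRM */
    return (0);
}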
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
index 67bd110cd884..be3cefced741 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,6 +34,18 @@
* Virtual device vector for files.
*/
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
@@ -51,6 +62,17 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
return (EINVAL);
}
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ vp = vf->vf_vnode;
+ goto skip_open;
+ }
+
vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
/*
@@ -65,6 +87,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (error);
}
@@ -77,9 +101,13 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (vp->v_type != VREG) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (ENODEV);
}
#endif
+
+skip_open:
/*
* Determine the physical size of the file.
*/
@@ -92,6 +120,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (error) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (error);
}
@@ -106,12 +136,15 @@ vdev_file_close(vdev_t *vd)
{
vdev_file_t *vf = vd->vdev_tsd;
- if (vf == NULL)
+ if (vd->vdev_reopening || vf == NULL)
return;
- if (vf->vf_vnode != NULL)
+ if (vf->vf_vnode != NULL) {
(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
kcred, NULL);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
kmem_free(vf, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
}
@@ -168,6 +201,8 @@ vdev_ops_t vdev_file_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
+ vdev_file_hold,
+ vdev_file_rele,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -184,6 +219,8 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
+ vdev_file_hold,
+ vdev_file_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index fa42871ebd5d..4d4b63cb2a07 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -47,31 +47,39 @@ struct g_class zfs_vdev_class = {
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+/*
+ * Don't send BIO_FLUSH.
+ */
+static int vdev_geom_bio_flush_disable = 0;
+TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+
static void
vdev_geom_orphan(struct g_consumer *cp)
{
- struct g_geom *gp;
vdev_t *vd;
- int error;
g_topology_assert();
vd = cp->private;
- gp = cp->geom;
- error = cp->provider->error;
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- if (cp->acr + cp->acw + cp->ace > 0)
- g_access(cp, -cp->acr, -cp->acw, -cp->ace);
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- g_detach(cp);
- g_destroy_consumer(cp);
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, error);
- }
- vd->vdev_tsd = NULL;
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}
@@ -223,16 +231,12 @@ vdev_geom_read_guid(struct g_consumer *cp)
uint64_t psize;
off_t offset, size;
uint64_t guid;
- int error, l, len, iszvol;
+ int error, l, len;
g_topology_assert_not();
pp = cp->provider;
ZFS_LOG(1, "Reading guid from %s...", pp->name);
- if (g_getattr("ZFS::iszvol", cp, &iszvol) == 0 && iszvol) {
- ZFS_LOG(1, "Skipping ZVOL-based provider %s.", pp->name);
- return (0);
- }
psize = pp->mediasize;
psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
@@ -270,11 +274,6 @@ vdev_geom_read_guid(struct g_consumer *cp)
return (guid);
}
-struct vdev_geom_find {
- uint64_t guid;
- struct g_consumer *cp;
-};
-
static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{
@@ -283,25 +282,23 @@ vdev_geom_taste_orphan(struct g_consumer *cp)
cp->provider->name));
}
-static void
-vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
+static struct g_consumer *
+vdev_geom_attach_by_guid(uint64_t guid)
{
- struct vdev_geom_find *ap;
struct g_class *mp;
struct g_geom *gp, *zgp;
struct g_provider *pp;
- struct g_consumer *zcp;
- uint64_t guid;
+ struct g_consumer *cp, *zcp;
+ uint64_t pguid;
g_topology_assert();
- ap = arg;
-
zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
/* This orphan function should never be called. */
zgp->orphan = vdev_geom_taste_orphan;
zcp = g_new_consumer(zgp);
+ cp = NULL;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
@@ -317,39 +314,29 @@ vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
continue;
}
g_topology_unlock();
- guid = vdev_geom_read_guid(zcp);
+ pguid = vdev_geom_read_guid(zcp);
g_topology_lock();
g_access(zcp, -1, 0, 0);
g_detach(zcp);
- if (guid != ap->guid)
+ if (pguid != guid)
continue;
- ap->cp = vdev_geom_attach(pp);
- if (ap->cp == NULL) {
+ cp = vdev_geom_attach(pp);
+ if (cp == NULL) {
printf("ZFS WARNING: Unable to attach to %s.\n",
pp->name);
continue;
}
- goto end;
+ break;
}
+ if (cp != NULL)
+ break;
}
+ if (cp != NULL)
+ break;
}
- ap->cp = NULL;
end:
g_destroy_consumer(zcp);
g_destroy_geom(zgp);
-}
-
-static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid)
-{
- struct vdev_geom_find *ap;
- struct g_consumer *cp;
-
- ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
- ap->guid = guid;
- g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL);
- cp = ap->cp;
- kmem_free(ap, sizeof(*ap));
return (cp);
}
@@ -360,6 +347,8 @@ vdev_geom_open_by_guid(vdev_t *vd)
char *buf;
size_t len;
+ g_topology_assert();
+
ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
cp = vdev_geom_attach_by_guid(vd->vdev_guid);
if (cp != NULL) {
@@ -387,8 +376,9 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
struct g_consumer *cp;
uint64_t guid;
+ g_topology_assert();
+
cp = NULL;
- g_topology_lock();
pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
@@ -410,7 +400,6 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
}
}
}
- g_topology_unlock();
return (cp);
}
@@ -420,7 +409,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
- int error, owned;
+ size_t bufsize;
+ int error, lock;
/*
* We must have a pathname, and it must be absolute.
@@ -432,15 +422,22 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
vd->vdev_tsd = NULL;
- if ((owned = mtx_owned(&Giant)))
- mtx_unlock(&Giant);
+ if (mutex_owned(&spa_namespace_lock)) {
+ mutex_exit(&spa_namespace_lock);
+ lock = 1;
+ } else {
+ lock = 0;
+ }
+ DROP_GIANT();
+ g_topology_lock();
error = 0;
/*
- * If we're creating pool, just find GEOM provider by its name
- * and ignore GUID mismatches.
+ * If we're creating or splitting a pool, just find the GEOM provider
+ * by its name and ignore GUID mismatches.
*/
- if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_splitting_newspa == B_TRUE)
cp = vdev_geom_open_by_path(vd, 0);
else {
cp = vdev_geom_open_by_path(vd, 1);
@@ -472,7 +469,6 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
int i;
- g_topology_lock();
for (i = 0; i < 5; i++) {
error = g_access(cp, 0, 1, 0);
if (error == 0)
@@ -487,10 +483,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
vdev_geom_detach(cp, 0);
cp = NULL;
}
- g_topology_unlock();
}
- if (owned)
- mtx_lock(&Giant);
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (lock)
+ mutex_enter(&spa_namespace_lock);
if (cp == NULL) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
@@ -516,6 +513,12 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
*/
vd->vdev_nowritecache = B_FALSE;
+ if (vd->vdev_physpath != NULL)
+ spa_strfree(vd->vdev_physpath);
+ bufsize = sizeof("/dev/") + strlen(pp->name);
+ vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
+ snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
+
return (0);
}
@@ -528,30 +531,50 @@ vdev_geom_close(vdev_t *vd)
if (cp == NULL)
return;
vd->vdev_tsd = NULL;
+ vd->vdev_delayed_close = B_FALSE;
g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
}
static void
vdev_geom_io_intr(struct bio *bp)
{
+ vdev_t *vd;
zio_t *zio;
zio = bp->bio_caller1;
+ vd = zio->io_vd;
zio->io_error = bp->bio_error;
if (zio->io_error == 0 && bp->bio_resid != 0)
zio->io_error = EIO;
if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
- vdev_t *vd;
-
/*
* If we get ENOTSUP, we know that no future
* attempts will ever succeed. In this case we
* set a persistent bit so that we don't bother
* with the ioctl in the future.
*/
- vd = zio->io_vd;
vd->vdev_nowritecache = B_TRUE;
}
+ if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
+ /*
+ * If the provider's error is set, we assume the device is
+ * being removed.
+ */
+ if (bp->bio_to->error != 0) {
+ /*
+ * We post the resource as soon as possible, instead of
+ * when the async removal actually happens, because the
+ * DE is using this information to discard previous I/O
+ * errors.
+ */
+ /* XXX: zfs_post_remove() can sleep. */
+ zfs_post_remove(zio->io_spa, vd);
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
g_destroy_bio(bp);
zio_interrupt(zio);
}
@@ -577,7 +600,7 @@ vdev_geom_io_start(zio_t *zio)
case DKIOCFLUSHWRITECACHE:
- if (zfs_nocacheflush)
+ if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
break;
if (vd->vdev_nowritecache) {
@@ -628,6 +651,16 @@ vdev_geom_io_done(zio_t *zio)
{
}
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
vdev_ops_t vdev_geom_ops = {
vdev_geom_open,
vdev_geom_close,
@@ -635,6 +668,8 @@ vdev_ops_t vdev_geom_ops = {
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
+ vdev_geom_hold,
+ vdev_geom_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index 48d5fc232b34..c08ed8ba0467 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -141,6 +140,7 @@
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
/*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare, boolean_t isl2cache)
+ vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare && !isl2cache)
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
== 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= SPA_VERSION_RAID6));
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
@@ -268,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
- if (!isspare && !isl2cache && vd == vd->vdev_top) {
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -279,42 +282,80 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
vd->vdev_islog) == 0);
+ if (vd->vdev_removing)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing) == 0);
}
if (vd->vdev_dtl_smo.smo_object != 0)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
+ pool_scan_stat_t ps;
+
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+ /* provide either current or previous scan information */
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ VERIFY(nvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+ == 0);
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare, isl2cache);
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ }
- for (c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
} else {
+ const char *aux = NULL;
+
if (vd->vdev_offline && !vd->vdev_tmpoffline)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
B_TRUE) == 0);
+ if (vd->vdev_resilvering)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ B_TRUE) == 0);
if (vd->vdev_faulted)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
B_TRUE) == 0);
@@ -327,11 +368,66 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
+
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
+ aux) == 0);
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid) == 0);
+ }
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
@@ -478,6 +574,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
return (B_TRUE);
/*
+ * We can't rely on a pool's state if it's been imported
+ * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
@@ -514,6 +619,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
crtxg, reason)) != 0)
return (error);
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
@@ -526,7 +634,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Determine if the vdev is in use.
*/
- if (reason != VDEV_LABEL_REMOVE &&
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (EBUSY);
@@ -552,7 +660,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
*/
if (reason == VDEV_LABEL_SPARE)
return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
}
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
@@ -617,7 +726,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
/*
* Add our creation time. This allows us to detect multiple
@@ -642,8 +755,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+ bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
@@ -672,11 +785,9 @@ retry:
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
- }
+ vdev_label_write(zio, vd, l, ub,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
@@ -688,7 +799,7 @@ retry:
nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
@@ -717,11 +828,6 @@ retry:
*/
/*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
-/*
* Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we
* come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -750,6 +856,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
static void
vdev_uberblock_load_done(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data;
uberblock_t *ubbest = rio->io_private;
@@ -758,7 +865,7 @@ vdev_uberblock_load_done(zio_t *zio)
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
- if (ub->ub_txg <= ub_max_txg &&
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub;
mutex_exit(&rio->io_lock);
@@ -976,6 +1083,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index fff7e0842256..698c0275d34e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio)
kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
}
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ vdev_mirror_map_free,
+ zio_vsd_default_cksum_report
+};
+
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
@@ -117,28 +122,28 @@ vdev_mirror_map_alloc(zio_t *zio)
}
zio->io_vsd = mm;
- zio->io_vsd_free = vdev_mirror_map_free;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
return (mm);
}
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- vdev_t *cvd;
- uint64_t c;
int numerrors = 0;
- int ret, lasterror = 0;
+ int lasterror = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_open_children(vd);
- if ((ret = vdev_open(cvd)) != 0) {
- lasterror = ret;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
@@ -158,9 +163,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
static void
vdev_mirror_close(vdev_t *vd)
{
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
@@ -211,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio)
uint64_t txg = zio->io_txg;
int i, c;
- ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
/*
* Try to find a child whose DTL doesn't contain the block to read.
@@ -449,6 +452,8 @@ vdev_ops_t vdev_mirror_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -460,6 +465,8 @@ vdev_ops_t vdev_replacing_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -471,6 +478,8 @@ vdev_ops_t vdev_spare_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
index 731f7d3dcec9..6a5588d59213 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
+ *psize = 0;
+ *ashift = 0;
return (0);
}
@@ -80,6 +80,21 @@ vdev_ops_t vdev_missing_ops = {
vdev_missing_io_start,
vdev_missing_io_done,
NULL,
+ NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
+
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ NULL,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index de3f1db75961..b44f3b289d9f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -24,7 +24,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
@@ -41,37 +40,48 @@
int zfs_vdev_max_pending = 10;
int zfs_vdev_min_pending = 4;
-/* deadline = pri + (LBOLT >> time_shift) */
+/* deadline = pri + (ddi_get_lbolt64() >> time_shift) */
int zfs_vdev_time_shift = 6;
/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;
/*
- * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
- * For read i/os, we also aggregate across small adjacency gaps.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
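For orientation, the span and gap tests in the aggregation walk below rely on
two small distance macros that live in an unchanged part of vdev_queue.c and
so do not appear in this hunk. A sketch of their conventional form, assuming
the stock definitions:

/*
 * IO_SPAN() measures from the first byte of fio to the last byte of lio;
 * IO_GAP() measures the hole between the end of one I/O and the start of
 * the next. Assumed definitions, shown here for reference only.
 */
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio)  (-IO_SPAN(lio, fio))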
SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
&zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
&zfs_vdev_min_pending, 0,
"Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
&zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
&zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
&zfs_vdev_aggregation_limit, 0,
"I/O requests are aggregated up to this size");
+TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
+ &zfs_vdev_read_gap_limit, 0,
+ "Acceptable gap between two reads being aggregated");
+TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
+ &zfs_vdev_write_gap_limit, 0,
+ "Acceptable gap between two writes being aggregated");
/*
* Virtual device vector for disk I/O scheduling.
@@ -191,12 +201,14 @@ vdev_queue_agg_io_done(zio_t *aio)
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
- zio_t *fio, *lio, *aio, *dio, *nio;
+ zio_t *fio, *lio, *aio, *dio, *nio, *mio;
avl_tree_t *t;
int flags;
uint64_t maxspan = zfs_vdev_aggregation_limit;
uint64_t maxgap;
+ int stretch;
+again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
@@ -211,21 +223,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
- * We can aggregate I/Os that are adjacent and of the
- * same flavor, as expressed by the AGG_INHERIT flags.
- * The latter is necessary so that certain attributes
- * of the I/O, such as whether it's a normal I/O or a
- * scrub/resilver, can be preserved in the aggregate.
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-optional I/O.
*/
while ((dio = AVL_PREV(t, fio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+ IO_SPAN(dio, lio) <= maxspan &&
+ IO_GAP(dio, fio) <= maxgap) {
fio = dio;
+ if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = fio;
+ }
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+ fio = AVL_NEXT(t, fio);
+ ASSERT(fio != NULL);
+ }
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ */
while ((dio = AVL_NEXT(t, lio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+ IO_SPAN(fio, dio) <= maxspan &&
+ IO_GAP(lio, dio) <= maxgap) {
lio = dio;
+ if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = lio;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. While we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ stretch = B_FALSE;
+ if (t != &vq->vq_read_tree && mio != NULL) {
+ nio = lio;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /* This may be a no-op. */
+ VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ while (lio != mio && lio != fio) {
+ ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+ lio = AVL_PREV(t, lio);
+ ASSERT(lio != NULL);
+ }
+ }
}
if (fio != lio) {
@@ -244,10 +323,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == t);
- if (dio->io_type == ZIO_TYPE_WRITE)
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+ bzero((char *)aio->io_data + (dio->io_offset -
+ aio->io_offset), dio->io_size);
+ } else if (dio->io_type == ZIO_TYPE_WRITE) {
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
+ }
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
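The trailing-optional handling above is easier to see in a toy model. The
sketch below (all names invented) mirrors the trim performed in the else
branch: an aggregate may carry optional NODATA I/Os in its interior, but must
not end with one unless a nearby non-optional write justified the stretch.

#include <stddef.h>

/* Toy stand-in for a queued I/O; only the field the trim needs. */
struct toy_io {
	int optional;	/* models ZIO_FLAG_OPTIONAL */
};

/*
 * Back the aggregate's last index up to the last non-optional I/O,
 * mirroring the "while (lio != mio && lio != fio)" loop above.
 */
static size_t
toy_trim(const struct toy_io *io, size_t first, size_t last)
{
	while (last > first && io[last].optional)
		last--;
	return (last);
}

int
main(void)
{
	struct toy_io q[] = { {0}, {1}, {0}, {1}, {1} };	/* W O W O O */

	/* The aggregate [0..4] ends in optionals; trim back to index 2. */
	return (toy_trim(q, 0, 4) == 2 ? 0 : 1);
}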
@@ -263,6 +347,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
ASSERT(fio->io_vdev_tree == t);
vdev_queue_io_remove(vq, fio);
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (fio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(fio);
+ zio_execute(fio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
avl_add(&vq->vq_pending_tree, fio);
return (fio);
@@ -288,7 +386,8 @@ vdev_queue_io(zio_t *zio)
mutex_enter(&vq->vq_lock);
- zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;
+ zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+ zio->io_priority;
vdev_queue_io_add(vq, zio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 92753d8714c0..4b0f5602c1d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,12 +34,27 @@
/*
* Virtual device vector for RAID-Z.
*
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field with 2^N elements,
+ * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
+ * can be expressed with a single byte.
+ * field are defined as follows:
*
* o addition (+) is represented by a bitwise XOR
* o subtraction (-) is therefore identical to addition: A + B = A - B
@@ -55,22 +69,32 @@
* (A * 2)_0 = A_7
*
* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
*
* Observe that any number in the field (except for 0) can be expressed as a
* power of 2 -- a generator for the field. We store a table of the powers of
* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
*
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
*
* P = D_0 + D_1 + ... + D_n-2 + D_n-1
* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property, which is why the uncorrected Plank method breaks down.)
*
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * See the reconstruction code below for how P, Q and R can be used
+ * or in concert to recover missing data columns.
*/
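The Horner-form expansions of Q and R above translate directly into a per-byte
accumulation loop. A minimal sketch under that reading (gf_mul2 and toy_pqr
are invented names; the real code below works on 64-bit words and must also
handle short columns):

#include <assert.h>
#include <stdint.h>

/* Multiply by 2 in GF(2^8) with the primitive polynomial x^8+x^4+x^3+x^2+1. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

/* Accumulate P, Q, and R for one byte position across n data columns. */
static void
toy_pqr(const uint8_t *d, int n, uint8_t *p, uint8_t *q, uint8_t *r)
{
	*p = *q = *r = 0;
	for (int c = 0; c < n; c++) {
		*p ^= d[c];				/* P: plain XOR */
		*q = gf_mul2(*q) ^ d[c];		/* Q: (...) * 2 + D */
		*r = gf_mul2(gf_mul2(*r)) ^ d[c];	/* R: (...) * 4 + D */
	}
}

int
main(void)
{
	uint8_t d[3] = { 1, 2, 3 }, p, q, r;

	toy_pqr(d, 3, &p, &q, &r);
	assert(p == (1 ^ 2 ^ 3));	/* 0 */
	assert(q == (4 ^ 4 ^ 3));	/* 2^2*1 + 2*2 + 3 = 3 */
	assert(r == (16 ^ 8 ^ 3));	/* 4^2*1 + 4*2 + 3 = 27 */
	return (0);
}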
typedef struct raidz_col {
@@ -78,27 +102,60 @@ typedef struct raidz_col {
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
void *rc_data; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;
typedef struct raidz_map {
- uint64_t rm_cols; /* Column count */
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ void *rm_datacopy; /* rm_asize-buffer of copied data */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
-#define VDEV_RAIDZ_MAXPARITY 2
+#define VDEV_RAIDZ_MAXPARITY 3
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
-#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
+
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
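As a sanity check on that claim, here is a small self-contained program (not
part of the diff) that compares the expanded 64-bit form of
VDEV_RAIDZ_64MUL_2 against byte-at-a-time multiplication by 2:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))

int
main(void)
{
	uint64_t x = 0x0123456789abcdefULL, mask;
	uint8_t b[8];

	memcpy(b, &x, sizeof (x));
	for (int i = 0; i < 8; i++)
		b[i] = (uint8_t)MUL_2(b[i]);	/* one byte lane at a time */

	/* VDEV_RAIDZ_64MUL_2, expanded: all eight byte lanes at once. */
	mask = x & 0x8080808080808080ULL;
	mask = (mask << 1) - (mask >> 7);
	x = ((x << 1) & 0xfefefefefefefefeULL) ^ (mask & 0x1d1d1d1d1d1d1d1dULL);

	assert(memcmp(b, &x, sizeof (x)) == 0);
	return (0);
}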
+
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
/*
* These two tables represent powers and logs of 2 in the Galois field defined
@@ -173,6 +230,8 @@ static const uint8_t vdev_raidz_log2[256] = {
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
+static void vdev_raidz_generate_parity(raidz_map_t *rm);
+
/*
* Multiply a given number by 2 raised to the given power.
*/
@@ -193,17 +252,184 @@ vdev_raidz_exp2(uint_t a, int exp)
}
static void
-vdev_raidz_map_free(zio_t *zio)
+vdev_raidz_map_free(raidz_map_t *rm)
{
- raidz_map_t *rm = zio->io_vsd;
int c;
+ size_t size;
- for (c = 0; c < rm->rm_firstdatacol; c++)
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+ if (rm->rm_col[c].rc_gdata != NULL)
+ zio_buf_free(rm->rm_col[c].rc_gdata,
+ rm->rm_col[c].rc_size);
+ }
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ if (rm->rm_datacopy != NULL)
+ zio_buf_free(rm->rm_datacopy, size);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT3U(rm->rm_freed, ==, 0);
+ rm->rm_freed = 1;
+
+ if (rm->rm_reports == 0)
+ vdev_raidz_map_free(rm);
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ vdev_raidz_map_free(rm);
}
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ size_t c = zcr->zcr_cbinfo;
+ size_t x;
+
+ const char *good = NULL;
+ const char *bad = rm->rm_col[c].rc_data;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ if (c < rm->rm_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical ZIO)
+ */
+ if (rm->rm_col[0].rc_gdata == NULL) {
+ char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ char *buf;
+
+ /*
+ * Set up the rm_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ bad_parity[x] = rm->rm_col[x].rc_data;
+ rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+ zio_buf_alloc(rm->rm_col[x].rc_size);
+ }
+
+ /* fill in the data columns from good_data */
+ buf = (char *)good_data;
+ for (; x < rm->rm_cols; x++) {
+ rm->rm_col[x].rc_data = buf;
+ buf += rm->rm_col[x].rc_size;
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity(rm);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rm->rm_firstdatacol; x++)
+ rm->rm_col[x].rc_data = bad_parity[x];
+
+ buf = rm->rm_datacopy;
+ for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
+ rm->rm_col[x].rc_data = buf;
+ buf += rm->rm_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
+ good = rm->rm_col[c].rc_gdata;
+ } else {
+ /* adjust good_data to point at the start of our column */
+ good = good_data;
+
+ for (x = rm->rm_firstdatacol; x < c; x++)
+ good += rm->rm_col[x].rc_size;
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ caddr_t buf;
+
+ raidz_map_t *rm = zio->io_vsd;
+ size_t size;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_raidz_cksum_finish;
+ zcr->zcr_free = vdev_raidz_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_datacopy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them.
+ */
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ buf = rm->rm_datacopy = zio_buf_alloc(size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bcopy(col->rc_data, buf, col->rc_size);
+ col->rc_data = buf;
+
+ buf += col->rc_size;
+ }
+ ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+ vdev_raidz_map_free_vsd,
+ vdev_raidz_cksum_report
+};
+
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
@@ -213,24 +439,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t s = zio->io_size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, coff, devidx;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
q = s / (dcols - nparity);
r = s - q * (dcols - nparity);
bc = (r == 0 ? 0 : r + nparity);
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ if (q == 0) {
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
- acols = (q == 0 ? bc : dcols);
+ ASSERT3U(acols, <=, scols);
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
rm->rm_cols = acols;
+ rm->rm_scols = scols;
rm->rm_bigcols = bc;
- rm->rm_asize = 0;
+ rm->rm_skipstart = bc;
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
+ rm->rm_datacopy = NULL;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
+
+ asize = 0;
- for (c = 0; c < acols; c++) {
+ for (c = 0; c < scols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
@@ -239,15 +481,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
rm->rm_col[c].rc_skipped = 0;
- rm->rm_asize += rm->rm_col[c].rc_size;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
}
- rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+ ASSERT3U(asize, ==, tot << unit_shift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+ ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
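A hypothetical worked example of the geometry above (numbers invented): a
5-wide raidz2 handling a 3-sector write lays out one full row plus parity,
and needs one skip sector so that the allocation stays a multiple of
nparity + 1:

#include <stdio.h>

/* roundup(x, y): smallest multiple of y that is >= x, as in the kernel. */
#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned dcols = 5, nparity = 2, s = 3;	/* 5-wide raidz2, 3 sectors */
	unsigned q = s / (dcols - nparity);		/* 1: one full row */
	unsigned r = s - q * (dcols - nparity);		/* 0: no remainder */
	unsigned bc = (r == 0 ? 0 : r + nparity);	/* 0 oversized columns */
	unsigned tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 5 sectors */
	unsigned acols = (q == 0) ? bc : dcols;			/* 5 */
	unsigned scols = (q == 0) ? MIN(dcols, ROUNDUP(bc, nparity + 1)) : dcols;
	unsigned nskip = ROUNDUP(tot, nparity + 1) - tot;	/* 6 - 5 = 1 */

	printf("q=%u r=%u bc=%u tot=%u acols=%u scols=%u nskip=%u\n",
	    q, r, bc, tot, acols, scols, nskip);
	return (0);
}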
@@ -272,6 +526,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
* Unfortunately, this decision created an implicit on-disk format
* requirement that we need to support for all eternity, but only
* for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
*/
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
@@ -283,10 +542,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
rm->rm_col[1].rc_devidx = devidx;
rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
}
zio->io_vsd = rm;
- zio->io_vsd_free = vdev_raidz_map_free;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -305,12 +567,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
if (c == rm->rm_firstdatacol) {
ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p = *src;
}
} else {
ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p ^= *src;
}
}
@@ -320,10 +582,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
int c;
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
@@ -331,55 +593,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
src = rm->rm_col[c].rc_data;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
*p = *src;
+ *q = *src;
}
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
+ for (; i < pcnt; i++, src++, p++, q++) {
*p = 0;
+ *q = 0;
}
} else {
- ASSERT(ccount <= pcount);
+ ASSERT(ccnt <= pcnt);
/*
- * Rather than multiplying each byte individually (as
- * described above), we are able to handle 8 at once
- * by generating a mask based on the high bit in each
- * byte and using that to conditionally XOR in 0x1d.
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
*/
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
+ *p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
*q ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (; i < pcnt; i++, q++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+ uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ int c;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+ *p = *src;
+ *q = *src;
+ *r = *src;
+ }
+ for (; i < pcnt; i++, src++, p++, q++, r++) {
+ *p = 0;
+ *q = 0;
+ *r = 0;
+ }
+ } else {
+ ASSERT(ccnt <= pcnt);
+
+ /*
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
+ */
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
*p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ *q ^= *src;
+
+ VDEV_RAIDZ_64MUL_4(*r, mask);
+ *r ^= *src;
}
/*
* Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
*/
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (; i < pcnt; i++, q++, r++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ VDEV_RAIDZ_64MUL_4(*r, mask);
}
}
}
}
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ switch (rm->rm_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rm);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rm);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rm);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, i;
+ int x = tgts[0];
int c;
+ ASSERT(ntgts == 1);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(x < rm->rm_cols);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
ASSERT(xcount > 0);
@@ -404,15 +749,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
*dst ^= *src;
}
}
+
+ return (1 << VDEV_RAIDZ_P);
}
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, mask, i;
uint8_t *b;
+ int x = tgts[0];
int c, j, exp;
+ ASSERT(ntgts == 1);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
@@ -436,23 +786,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
}
} else {
- /*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
- */
for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
*dst ^= *src;
}
for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
}
}
}
@@ -467,15 +807,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
*b = vdev_raidz_exp2(*b, exp);
}
}
+
+ return (1 << VDEV_RAIDZ_Q);
}
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
void *pdata, *qdata;
uint64_t xsize, ysize, i;
+ int x = tgts[0];
+ int y = tgts[1];
+ ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(y < rm->rm_cols);
@@ -553,15 +898,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
*/
rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
+ * computation, and for linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
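Every field multiplication in the elimination code below goes through the
log/exp tables rather than bit-level arithmetic. A compact, self-contained
model of that identity (the init function and table names are invented
stand-ins for the static vdev_raidz_pow2/vdev_raidz_log2 arrays shown earlier
in this file):

#include <stdint.h>

static uint8_t pow2[256], logt[256];

/* Generate powers and logs of 2 over GF(2^8), polynomial 0x11d. */
static void
gf_tables_init(void)
{
	uint8_t x = 1;

	for (int i = 0; i < 255; i++) {
		pow2[i] = x;
		logt[x] = (uint8_t)i;
		x = (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
	}
	pow2[255] = pow2[0];	/* 2^255 == 2^0 == 1 */
}

/* A * B in GF(2^8): add logarithms mod 255, then exponentiate. */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	int l;

	if (a == 0 || b == 0)
		return (0);
	l = logt[a] + logt[b];
	if (l >= 255)
		l -= 255;
	return (pow2[l]);
}

int
main(void)
{
	gf_tables_init();
	/* 2 * 4 = 8; 0x80 * 2 wraps through the polynomial to 0x1d. */
	return (gf_mul(2, 4) == 8 && gf_mul(0x80, 2) == 0x1d ? 0 : 1);
}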
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+ jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each row of interest, normalize it and subtract a multiple of
+ * it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+ * Verify that the data that is left in the rows is properly part of
+ * an identity matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ }
+ }
}
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t log, val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rm->rm_cols);
+
+ src = rm->rm_col[c].rc_data;
+ ccount = rm->rm_col[c].rc_size;
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rm->rm_firstdatacol;
+ ASSERT3U(cc, >=, rm->rm_firstdatacol);
+ ASSERT3U(cc, <, rm->rm_cols);
+ ASSERT3U(cc, !=, c);
+
+ dst[j] = rm->rm_col[cc].rc_data;
+ dcount[j] = rm->rm_col[cc].rc_size;
+ }
+
+ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+ uint8_t *p, *pp;
+ size_t psize;
+
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ int code = 0;
+
+
+ n = rm->rm_cols - rm->rm_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rm->rm_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rm->rm_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rm->rm_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rm->rm_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * The tgts list must already be sorted.
+ */
+ for (i = 1; i < nt; i++) {
+ ASSERT(t[i] > t[i - 1]);
+ }
+
+ nbadparity = rm->rm_firstdatacol;
+ nbaddata = rm->rm_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rm->rm_cols; c++) {
+ if (c < rm->rm_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rm->rm_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rm->rm_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ if (!vdev_raidz_default_to_general) {
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+
+ break;
+ }
+ }
+
+ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
- int c, error;
+ int c;
int lasterror = 0;
int numerrors = 0;
@@ -573,11 +1457,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
return (EINVAL);
}
+ vdev_open_children(vd);
+
for (c = 0; c < vd->vdev_children; c++) {
cvd = vd->vdev_child[c];
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
@@ -636,10 +1522,9 @@ vdev_raidz_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
- blkptr_t *bp = zio->io_bp;
raidz_map_t *rm;
raidz_col_t *rc;
- int c;
+ int c, i;
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
@@ -647,13 +1532,7 @@ vdev_raidz_io_start(zio_t *zio)
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * Generate RAID parity in the first virtual columns.
- */
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
@@ -664,6 +1543,23 @@ vdev_raidz_io_start(zio_t *zio)
vdev_raidz_child_done, rc));
}
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rm->rm_scols);
+ if (c == rm->rm_scols)
+ c = 0;
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
return (ZIO_PIPELINE_CONTINUE);
}
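
The skip-sector loop above walks rm_nskip columns starting at rm_skipstart and wraps to column 0 on reaching rm_scols. A standalone sketch of that ring traversal; the variable names mirror the map fields but the geometry values are made up:

    #include <stdio.h>

    int
    main(void)
    {
            int scols = 5, skipstart = 3, nskip = 4;
            int c, i;

            /* Visit nskip columns from skipstart, wrapping at scols. */
            for (c = skipstart, i = 0; i < nskip; c++, i++) {
                    if (c == scols)
                            c = 0;
                    printf("optional skip I/O on column %d\n", c);
            }
            return (0);     /* visits columns 3, 4, 0, 1 */
    }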
@@ -671,8 +1567,7 @@ vdev_raidz_io_start(zio_t *zio)
/*
* Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity
- * data.
+ * last -- any errors along the way will force us to read the parity.
*/
for (c = rm->rm_cols - 1; c >= 0; c--) {
rc = &rm->rm_col[c];
@@ -687,7 +1582,7 @@ vdev_raidz_io_start(zio_t *zio)
rc->rc_skipped = 1;
continue;
}
- if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
@@ -708,23 +1603,47 @@ vdev_raidz_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+
/*
* Report a checksum error for a child of a RAID-Z device.
*/
static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
+
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+ &zbc);
}
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret != 0 && zbc.zbc_injected != 0)
+ rm->rm_ecksuminjected = 1;
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+ return (ret);
}
/*
@@ -748,17 +1667,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
bcopy(rc->rc_data, orig[c], rc->rc_size);
}
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
- raidz_checksum_error(zio, rc);
+ raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = ECKSUM;
ret++;
}
@@ -768,9 +1684,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
return (ret);
}
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
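
Each slot of raidz_corrected[] is indexed by the bitmask that the reconstruction code returns, where bit c is set when parity column c participated in the repair. A small sketch that decodes such a mask, assuming the usual P=0, Q=1, R=2 parity column ordering:

    #include <stdio.h>

    /* Sketch: decode a parity-combination bitmask into parity letters. */
    static void
    decode_code(int code)
    {
            static const char names[] = "PQR";
            int c;

            for (c = 0; c < 3; c++) {
                    if (code & (1 << c))
                            putchar(names[c]);
            }
            putchar('\n');
    }

    int
    main(void)
    {
            decode_code(1 << 0);              /* prints "P"  */
            decode_code((1 << 0) | (1 << 2)); /* prints "PR" */
            return (0);
    }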
static int
vdev_raidz_worst_error(raidz_map_t *rm)
@@ -783,19 +1700,177 @@ vdev_raidz_worst_error(raidz_map_t *rm)
return (error);
}
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same whether
+ * column 4 alone is targeted as invalid or columns 1 and 4 are both
+ * targeted, since in both cases we'd only use parity information in
+ * column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *tgts = &tstore[1];
+ int current, next, i, c, n;
+ int code, ret = 0;
+
+ ASSERT(total_errors < rm->rm_firstdatacol);
+
+ /*
+ * This simplifies one edge condition.
+ */
+ tgts[-1] = -1;
+
+ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ /*
+ * Initialize the targets array by finding the first n columns
+ * that contain no error.
+ *
+ * If there were no data errors, we need to ensure that we're
+ * always explicitly attempting to reconstruct at least one
+ * data column. To do this, we simply push the highest target
+ * up into the data columns.
+ */
+ for (c = 0, i = 0; i < n; i++) {
+ if (i == n - 1 && data_errors == 0 &&
+ c < rm->rm_firstdatacol) {
+ c = rm->rm_firstdatacol;
+ }
+
+ while (rm->rm_col[c].rc_error != 0) {
+ c++;
+ ASSERT3S(c, <, rm->rm_cols);
+ }
+
+ tgts[i] = c++;
+ }
+
+ /*
+ * Setting tgts[n] simplifies the other edge condition.
+ */
+ tgts[n] = rm->rm_cols;
+
+ /*
+ * These buffers were allocated in previous iterations.
+ */
+ for (i = 0; i < n - 1; i++) {
+ ASSERT(orig[i] != NULL);
+ }
+
+ orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+ current = 0;
+ next = tgts[current];
+
+ while (current != n) {
+ tgts[current] = next;
+ current = 0;
+
+ /*
+ * Save off the original data that we're going to
+ * attempt to reconstruct.
+ */
+ for (i = 0; i < n; i++) {
+ ASSERT(orig[i] != NULL);
+ c = tgts[i];
+ ASSERT3S(c, >=, 0);
+ ASSERT3S(c, <, rm->rm_cols);
+ rc = &rm->rm_col[c];
+ bcopy(rc->rc_data, orig[i], rc->rc_size);
+ }
+
+ /*
+ * Attempt a reconstruction and exit the outer loop on
+ * success.
+ */
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
+
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ ASSERT(rc->rc_error == 0);
+ if (rc->rc_tried)
+ raidz_checksum_error(zio, rc,
+ orig[i]);
+ rc->rc_error = ECKSUM;
+ }
+
+ ret = code;
+ goto done;
+ }
+
+ /*
+ * Restore the original data.
+ */
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ bcopy(orig[i], rc->rc_data, rc->rc_size);
+ }
+
+ do {
+ /*
+ * Find the next valid column after the current
+ * position.
+ */
+ for (next = tgts[current] + 1;
+ next < rm->rm_cols &&
+ rm->rm_col[next].rc_error != 0; next++)
+ continue;
+
+ ASSERT(next <= tgts[current + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ */
+ if (next != tgts[current + 1])
+ break;
+
+ /*
+ * Otherwise, find the next valid column after
+ * the previous position.
+ */
+ for (c = tgts[current - 1] + 1;
+ rm->rm_col[c].rc_error != 0; c++)
+ continue;
+
+ tgts[current] = c;
+ current++;
+
+ } while (current != n);
+ }
+ }
+ n--;
+done:
+ for (i = 0; i < n; i++) {
+ zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+ }
+
+ return (ret);
+}
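
Stripped of the error-column skipping and the save/restore of column data, the tgts[] odometer above enumerates n-element subsets of the columns in lexicographic order. A self-contained sketch of that traversal:

    #include <stdio.h>

    /*
     * Sketch: print every n-element subset of columns 0..cols-1 in
     * lexicographic order. The combrec loop above follows the same
     * order but additionally skips columns with known errors.
     */
    static void
    enumerate(int n, int cols)
    {
            int tgts[16];   /* arbitrary cap for the sketch */
            int i;

            for (i = 0; i < n; i++)
                    tgts[i] = i;

            for (;;) {
                    for (i = 0; i < n; i++)
                            printf("%d%c", tgts[i], i == n - 1 ? '\n' : ' ');

                    /* Rightmost position that can still advance. */
                    for (i = n - 1; i >= 0 && tgts[i] == cols - n + i; i--)
                            continue;
                    if (i < 0)
                            break;

                    /* Advance it; reset everything to its right. */
                    tgts[i]++;
                    for (i++; i < n; i++)
                            tgts[i] = tgts[i - 1] + 1;
            }
    }

    int
    main(void)
    {
            enumerate(2, 4);        /* 0 1, 0 2, 0 3, 1 2, 1 3, 2 3 */
            return (0);
    }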
+
static void
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_t *cvd;
raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc, *rc1;
+ raidz_col_t *rc;
int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c, c1;
+ int n, c;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int code;
ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
@@ -859,9 +1934,8 @@ vdev_raidz_io_done(zio_t *zio)
* any errors.
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- switch (data_errors) {
- case 0:
- if (zio_checksum_error(zio) == 0) {
+ if (data_errors == 0) {
+ if (raidz_checksum_verify(zio) == 0) {
/*
* If we read parity information (unnecessarily
* as it happens since no reconstruction was
@@ -880,9 +1954,7 @@ vdev_raidz_io_done(zio_t *zio)
}
goto done;
}
- break;
-
- case 1:
+ } else {
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
@@ -894,45 +1966,38 @@ vdev_raidz_io_done(zio_t *zio)
ASSERT(parity_errors < rm->rm_firstdatacol);
/*
- * Find the column that reported the error.
+ * Identify the data columns that reported an error.
*/
+ n = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
}
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(rm, c);
- } else {
- ASSERT(rm->rm_firstdatacol > 1);
- vdev_raidz_reconstruct_q(rm, c);
- }
+ ASSERT(rm->rm_firstdatacol >= n);
- if (zio_checksum_error(zio) == 0) {
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
- atomic_inc_64(&raidz_corrected_p);
- else
- atomic_inc_64(&raidz_corrected_q);
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
/*
- * If there's more than one parity disk that
- * was successfully read, confirm that the
- * other parity disk produced the correct data.
- * This routine is suboptimal in that it
- * regenerates both the parity we wish to test
- * as well as the parity we just used to
- * perform the reconstruction, but this should
- * be a relatively uncommon case, and can be
- * optimized if it becomes a problem.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
+ * If we read more parity disks than were used
+ * for reconstruction, confirm that the other
+ * parity disks produced correct data. This
+ * routine is suboptimal in that it regenerates
+ * the parity that we already used in addition
+ * to the parity that we're attempting to
+ * verify, but this should be a relatively
+ * uncommon case, and can be optimized if it
+ * becomes a problem. Note that we regenerate
+ * parity when resilvering so we can write it
+ * out to failed devices later.
*/
- if (parity_errors < rm->rm_firstdatacol - 1 ||
+ if (parity_errors < rm->rm_firstdatacol - n ||
(zio->io_flags & ZIO_FLAG_RESILVER)) {
n = raidz_parity_verify(zio, rm);
unexpected_errors += n;
@@ -942,46 +2007,6 @@ vdev_raidz_io_done(zio_t *zio)
goto done;
}
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- ASSERT(rm->rm_firstdatacol == 2);
-
- /*
- * Find the two columns that reported errors.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- for (c1 = c++; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- vdev_raidz_reconstruct_pq(rm, c1, c);
-
- if (zio_checksum_error(zio) == 0) {
- atomic_inc_64(&raidz_corrected_pq);
- goto done;
- }
- break;
-
- default:
- ASSERT(rm->rm_firstdatacol <= 2);
- ASSERT(0);
}
}
@@ -1020,145 +2045,54 @@ vdev_raidz_io_done(zio_t *zio)
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction make
- * sure we have a chance of coming up with the right answer.
+ * in absent data. We check if there is enough additional data to
+ * possibly reconstruct the data and then perform combinatorial
+ * reconstruction over all possible combinations. If that fails,
+ * we're cooked.
*/
- if (total_errors >= rm->rm_firstdatacol) {
+ if (total_errors > rm->rm_firstdatacol) {
zio->io_error = vdev_raidz_worst_error(rm);
- /*
- * If there were exactly as many device errors as parity
- * columns, yet we couldn't reconstruct the data, then at
- * least one device must have returned bad data silently.
- */
- if (total_errors == rm->rm_firstdatacol)
- zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
- goto done;
- }
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity P.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_p(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_p);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
- if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ } else if (total_errors < rm->rm_firstdatacol &&
+ (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
/*
- * Attempt to reconstruct the data from parity Q.
+ * If we didn't use all the available parity for the
+ * combinatorial reconstruction, verify that the remaining
+ * parity is correct.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_q(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 &&
- rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
- rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ if (code != (1 << rm->rm_firstdatacol) - 1)
+ (void) raidz_parity_verify(zio, rm);
+ } else {
/*
- * Attempt to reconstruct the data from both P and Q.
+ * We're here because either:
+ *
+ * total_errors == rm_firstdatacol, or
+ * vdev_raidz_combrec() failed
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+ * Start checksum ereports for all children that haven't
+ * failed, provided the I/O wasn't speculative.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
- void *orig, *orig1;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
-
- for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
- rc1 = &rm->rm_col[c1];
-
- orig1 = zio_buf_alloc(rc1->rc_size);
- bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(rm, c, c1);
+ zio->io_error = ECKSUM;
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- atomic_inc_64(&raidz_corrected_pq);
-
- /*
- * If these children didn't know they
- * returned bad data, inform them.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- if (rc1->rc_tried && rc1->rc_error == 0)
- raidz_checksum_error(zio, rc1);
-
- rc->rc_error = ECKSUM;
- rc1->rc_error = ECKSUM;
-
- goto done;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
+
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
}
-
- bcopy(orig1, rc1->rc_data, rc1->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
}
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- /*
- * All combinations failed to checksum. Generate checksum ereports for
- * all children.
- */
- zio->io_error = ECKSUM;
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
- rc->rc_offset, rc->rc_size);
}
}
@@ -1205,6 +2139,8 @@ vdev_ops_t vdev_raidz_ops = {
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
index 88383f002b80..879f78f3a5b3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors)
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- int c;
int lasterror = 0;
int numerrors = 0;
@@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
- int error;
- if ((error = vdev_open(cvd)) != 0 &&
- !cvd->vdev_islog) {
- lasterror = error;
+ if (cvd->vdev_open_error && !cvd->vdev_islog) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
- continue;
}
}
@@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
static void
vdev_root_close(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
@@ -113,6 +109,8 @@ vdev_ops_t vdev_root_ops = {
NULL, /* io_start - not applicable to the root */
NULL, /* io_done - not applicable to the root */
vdev_root_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index 7abe63ac917d..288a4d99ab25 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -19,13 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* This file contains the top half of the zfs directory structure
* implementation. The bottom half is in zap_leaf.c.
@@ -45,11 +41,11 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/zfs_znode.h>
int fzap_default_block_shift = 14; /* 16k blocksize */
@@ -73,7 +69,7 @@ fzap_byteswap(void *vbuf, size_t size)
}
void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
dmu_buf_t *db;
zap_leaf_t *l;
@@ -86,7 +82,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
&zap->zap_f.zap_phys, zap_evict);
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
zp = zap->zap_f.zap_phys;
@@ -105,6 +101,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
zp->zap_num_entries = 0;
zp->zap_salt = zap->zap_salt;
zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
/* block 1 will be the first leaf */
for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
@@ -114,7 +111,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
* set up block 1 - the first leaf
*/
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
@@ -175,20 +172,20 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
b = tbl->zt_blks_copied;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
if (err)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs, FTAG, &db_new));
+ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs, FTAG, &db_new));
+ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
db_new->db_data, hepb);
@@ -236,7 +233,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
dmu_buf_will_dirty(db, tx);
@@ -248,7 +245,8 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_buf_t *db2;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+ DMU_READ_NO_PREFETCH);
if (err) {
dmu_buf_rele(db, FTAG);
return (err);
@@ -279,7 +277,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@@ -294,7 +292,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
blk = (idx*2) >> (bs-3);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+ DMU_READ_NO_PREFETCH);
dmu_buf_rele(db, FTAG);
}
return (err);
@@ -318,8 +317,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
static int
zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
{
- /* In case things go horribly wrong. */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
return (ENOSPC);
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
@@ -338,7 +342,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
newblk = zap_allocate_blocks(zap, 1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+ DMU_READ_NO_PREFETCH);
if (err)
return (err);
dmu_buf_will_dirty(db_new, tx);
@@ -389,14 +394,15 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
l->l_phys = NULL;
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ DMU_READ_NO_PREFETCH));
winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
@@ -447,7 +453,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
ASSERT(blkid != 0);
l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = blkid;
l->l_bs = highbit(db->db_size)-1;
@@ -499,7 +505,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- blkid << bs, NULL, &db);
+ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
@@ -703,13 +709,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
}
}
-
static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+fzap_checkname(zap_name_t *zn)
{
- if (name && strlen(name) > ZAP_MAXNAMELEN)
- return (E2BIG);
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+ return (0);
+}
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
/* Only integer sizes supported by C */
switch (integer_size) {
case 1:
@@ -727,6 +737,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
return (0);
}
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+ int err;
+
+ if ((err = fzap_checkname(zn)) != 0)
+ return (err);
+ return (fzap_checksize(integer_size, num_integers));
+}
+
/*
* Routines for manipulating attributes.
*/
@@ -739,8 +759,7 @@ fzap_lookup(zap_name_t *zn,
int err;
zap_entry_handle_t zeh;
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
- if (err != 0)
+ if ((err = fzap_checkname(zn)) != 0)
return (err);
err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
@@ -748,8 +767,13 @@ fzap_lookup(zap_name_t *zn,
return (err);
err = zap_leaf_lookup(l, zn, &zeh);
if (err == 0) {
+ if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
err = zap_entry_read(&zeh, integer_size, num_integers, buf);
- (void) zap_entry_read_name(&zeh, rn_len, realname);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
if (ncp) {
*ncp = zap_entry_normalization_conflict(&zeh,
zn, NULL, zn->zn_zap);
@@ -772,8 +796,7 @@ fzap_add_cd(zap_name_t *zn,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_checksize(zn->zn_name_orij,
- integer_size, num_integers) == 0);
+ ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
if (err != 0)
@@ -787,7 +810,7 @@ retry:
if (err != ENOENT)
goto out;
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+ err = zap_entry_create(l, zn, cd,
integer_size, num_integers, val, &zeh);
if (err == 0) {
@@ -810,12 +833,12 @@ fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
- int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
- val, ZAP_MAXCD, tx));
+ val, ZAP_NEED_CD, tx));
}
int
@@ -828,7 +851,7 @@ fzap_update(zap_name_t *zn,
zap_t *zap = zn->zn_zap;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
@@ -841,8 +864,8 @@ retry:
ASSERT(err == 0 || err == ENOENT);
if (create) {
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
- ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
if (err == 0)
zap_increment_num_entries(zap, 1, tx);
} else {
@@ -904,6 +927,21 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
return (err);
}
+void
+fzap_prefetch(zap_name_t *zn)
+{
+ uint64_t idx, blk;
+ zap_t *zap = zn->zn_zap;
+ int bs;
+
+ idx = ZAP_HASH_IDX(zn->zn_hash,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+}
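
fzap_prefetch() derives the pointer-table index from the top zt_shift bits of the key's hash. If memory serves, ZAP_HASH_IDX() in zap_impl.h is exactly that high-bits shift; a hedged restatement:

    #include <stdint.h>
    #include <assert.h>

    /* Believed definition of ZAP_HASH_IDX(): the top n bits of the hash. */
    #define ZAP_HASH_IDX(hash, n)  (((n) == 0) ? 0 : ((hash) >> (64 - (n))))

    int
    main(void)
    {
            uint64_t hash = 0xabcd000000000000ULL;

            /* A 16-bit pointer table indexes by the top 16 hash bits. */
            assert(ZAP_HASH_IDX(hash, 16) == 0xabcd);
            return (0);
    }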
+
/*
* Helper functions for consumers.
*/
@@ -955,6 +993,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
}
int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za.za_integer_length != 8 || za.za_num_integers != 1)
+ return (EINVAL);
+ err = zap_add(os, intoobj, za.za_name,
+ 8, 1, &value, tx);
+ if (err)
+ return (err);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ uint64_t delta = 0;
+
+ if (za.za_integer_length != 8 || za.za_num_integers != 1)
+ return (EINVAL);
+
+ err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ delta += za.za_first_integer;
+ err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+ if (err)
+ return (err);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+int
zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
{
char name[20];
@@ -981,6 +1069,56 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
return (zap_lookup(os, obj, name, 8, 1, &value));
}
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
+ uint64_t value = 0;
+ int err;
+
+ if (delta == 0)
+ return (0);
+
+ err = zap_lookup(os, obj, name, 8, 1, &value);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ value += delta;
+ if (value == 0)
+ err = zap_remove(os, obj, name, tx);
+ else
+ err = zap_update(os, obj, name, 8, 1, &value, tx);
+ return (err);
+}
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_increment(os, obj, name, delta, tx));
+}
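
zap_increment() layers counter semantics over lookup/update/remove: a missing entry reads as zero, and an entry that returns to zero is deleted rather than stored. A hedged usage sketch; the name "refs" is illustrative, and os, obj, and tx are assumed to come from the surrounding transaction context:

    /*
     * Sketch: refcount-style use of zap_increment().
     */
    static int
    take_and_drop_ref(objset_t *os, uint64_t obj, dmu_tx_t *tx)
    {
            int err;

            /* A missing entry reads as 0, so this creates "refs" = 1. */
            err = zap_increment(os, obj, "refs", 1, tx);
            if (err != 0)
                    return (err);

            /* Dropping back to 0 removes the entry entirely. */
            return (zap_increment(os, obj, "refs", -1, tx));
    }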
+
/*
* Routines for iterating over the attributes.
*/
@@ -1042,7 +1180,7 @@ again:
err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
ASSERT(err == 0 || err == EOVERFLOW);
}
- err = zap_entry_read_name(&zeh,
+ err = zap_entry_read_name(zap, &zeh,
sizeof (za->za_name), za->za_name);
ASSERT(err == 0);
@@ -1054,7 +1192,6 @@ again:
return (err);
}
-
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
@@ -1081,6 +1218,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
}
}
+int
+fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
+{
+ int err;
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0)
+ return (err);
+
+ zc->zc_leaf = l;
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+
+ return (err);
+}
+
void
fzap_get_stats(zap_t *zap, zap_stats_t *zs)
{
@@ -1126,7 +1288,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
- FTAG, &db);
+ FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {
zap_stats_ptrtbl(zap, db->db_data,
1<<(bs-3), zs);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
index da498b6bc9e3..19a795db825b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -19,24 +19,24 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* The 512-byte leaf is broken into 32 16-byte chunks.
 * Chunk number n means l_chunk[n], even though the header precedes it.
 * The names are stored null-terminated.
*/
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
+#include <sys/arc.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
@@ -127,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
le = &lc->l_entry;
le->le_type = BSWAP_8(le->le_type);
- le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_value_intlen = BSWAP_8(le->le_value_intlen);
le->le_next = BSWAP_16(le->le_next);
le->le_name_chunk = BSWAP_16(le->le_name_chunk);
- le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_name_numints = BSWAP_16(le->le_name_numints);
le->le_value_chunk = BSWAP_16(le->le_value_chunk);
- le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_value_numints = BSWAP_16(le->le_value_numints);
le->le_cd = BSWAP_32(le->le_cd);
le->le_hash = BSWAP_64(le->le_hash);
break;
@@ -215,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
static uint16_t
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
- int integer_size, int num_integers)
+ int integer_size, int num_integers)
{
uint16_t chunk_head;
uint16_t *chunkp = &chunk_head;
@@ -273,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- char *buf)
+ void *buf)
{
int len = MIN(array_len, buf_len);
int byten = 0;
uint64_t value = 0;
+ char *p = buf;
ASSERT3U(array_int_len, <=, buf_int_len);
@@ -285,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
uint8_t *ip = la->la_array;
- uint64_t *buf64 = (uint64_t *)buf;
+ uint64_t *buf64 = buf;
*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
(uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
@@ -300,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
while (chunk != CHAIN_END) {
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
- buf += ZAP_LEAF_ARRAY_BYTES;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
chunk = la->la_next;
}
return;
@@ -316,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
- stv(buf_int_len, buf, value);
+ stv(buf_int_len, p, value);
byten = 0;
len--;
if (len == 0)
return;
- buf += buf_int_len;
+ p += buf_int_len;
}
}
chunk = la->la_next;
}
}
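
The tail of zap_leaf_array_read() accumulates source bytes big-endian and emits one destination integer each time array_int_len bytes have been gathered; note that the loop never clears value, relying on stv() to store only the low buf_int_len bytes. A standalone sketch for 2-byte source integers:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            /* Two 2-byte big-endian integers as laid out in a chunk. */
            uint8_t la_array[] = { 0x12, 0x34, 0xab, 0xcd };
            int array_int_len = 2, byten = 0, i;
            uint64_t value = 0;

            for (i = 0; i < 4; i++) {
                    value = (value << 8) | la_array[i];
                    if (++byten == array_int_len) {
                            /* stv() would store the low 2 bytes here. */
                            printf("0x%04x\n",
                                (unsigned)(value & 0xffff));
                            byten = 0;
                    }
            }
            return (0);     /* prints 0x1234 then 0xabcd */
    }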
-/*
- * Only to be used on 8-bit arrays.
- * array_len is actual len in bytes (not encoded le_value_length).
- * namenorm is null-terminated.
- */
static boolean_t
-zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len)
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+ int chunk, int array_numints)
{
int bseen = 0;
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey;
+ boolean_t match;
+
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+ thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
+ KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+ sizeof (*thiskey), array_numints, thiskey);
+ match = bcmp(thiskey, zn->zn_key_orig,
+ array_numints * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_numints * sizeof (*thiskey));
+ return (match);
+ }
+
+ ASSERT(zn->zn_key_intlen == 1);
if (zn->zn_matchtype == MT_FIRST) {
- char *thisname = kmem_alloc(array_len, KM_SLEEP);
+ char *thisname = kmem_alloc(array_numints, KM_SLEEP);
boolean_t match;
- zap_leaf_array_read(l, chunk, 1, array_len, 1,
- array_len, thisname);
+ zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+ sizeof (char), array_numints, thisname);
match = zap_match(zn, thisname);
- kmem_free(thisname, array_len);
+ kmem_free(thisname, array_numints);
return (match);
}
- /* Fast path for exact matching */
- while (bseen < array_len) {
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_numints != zn->zn_key_orig_numints)
+ return (B_FALSE);
+ while (bseen < array_numints) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
break;
chunk = la->la_next;
bseen += toread;
}
- return (bseen == array_len);
+ return (bseen == array_numints);
}
/*
@@ -394,9 +414,9 @@ again:
ASSERT(zn->zn_matchtype == MT_EXACT ||
(l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
if (zap_leaf_array_match(l, zn, le->le_name_chunk,
- le->le_name_length)) {
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
+ le->le_name_numints)) {
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
@@ -427,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
{
uint16_t chunk;
uint64_t besth = -1ULL;
- uint32_t bestcd = ZAP_MAXCD;
+ uint32_t bestcd = -1U;
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
uint16_t lh;
struct zap_leaf_entry *le;
@@ -449,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
besth = le->le_hash;
bestcd = le->le_cd;
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_fakechunk = chunk;
@@ -460,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
}
}
- return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+ return (bestcd == -1U ? ENOENT : 0);
}
int
@@ -471,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh,
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- if (le->le_int_size > integer_size)
+ if (le->le_value_intlen > integer_size)
return (EINVAL);
- zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
- le->le_value_length, integer_size, num_integers, buf);
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+ le->le_value_intlen, le->le_value_numints,
+ integer_size, num_integers, buf);
if (zeh->zeh_num_integers > num_integers)
return (EOVERFLOW);
@@ -484,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh,
}
int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
{
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
- le->le_name_length, 1, buflen, buf);
- if (le->le_name_length > buflen)
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_numints, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_numints, 1, buflen, buf);
+ }
+ if (le->le_name_numints > buflen)
return (EOVERFLOW);
return (0);
}
@@ -506,24 +533,16 @@ zap_entry_update(zap_entry_handle_t *zeh,
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
return (EAGAIN);
- /*
- * We should search other chained leaves (via
- * zap_entry_remove,create?) otherwise returning EAGAIN will
- * just send us into an infinite loop if we have to chain
- * another leaf block, rather than being able to split this
- * block.
- */
-
zap_leaf_array_free(l, &le->le_value_chunk);
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
return (0);
}
@@ -550,26 +569,25 @@ zap_entry_remove(zap_entry_handle_t *zeh)
}
int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
uint8_t integer_size, uint64_t num_integers, const void *buf,
zap_entry_handle_t *zeh)
{
uint16_t chunk;
uint16_t *chunkp;
struct zap_leaf_entry *le;
- uint64_t namelen, valuelen;
+ uint64_t valuelen;
int numchunks;
+ uint64_t h = zn->zn_hash;
valuelen = integer_size * num_integers;
- namelen = strlen(name) + 1;
- ASSERT(namelen >= 2);
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
- ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (E2BIG);
- if (cd == ZAP_MAXCD) {
+ if (cd == ZAP_NEED_CD) {
/* find the lowest unused cd */
if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
cd = 0;
@@ -586,7 +604,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
}
} else {
/* old unsorted format; do it the O(n^2) way */
- for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (cd = 0; ; cd++) {
for (chunk = *LEAF_HASH_ENTPTR(l, h);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -601,10 +619,10 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
}
}
/*
- * we would run out of space in a block before we could
- * have ZAP_MAXCD entries
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
*/
- ASSERT3U(cd, <, ZAP_MAXCD);
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
}
if (l->l_phys->l_hdr.lh_nfree < numchunks)
@@ -614,12 +632,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
chunk = zap_leaf_chunk_alloc(l);
le = ZAP_LEAF_ENTRY(l, chunk);
le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
- le->le_name_length = namelen;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_numints);
+ le->le_name_numints = zn->zn_key_orig_numints;
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
le->le_hash = h;
le->le_cd = cd;
@@ -631,7 +650,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
zeh->zeh_leaf = l;
zeh->zeh_num_integers = num_integers;
- zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
@@ -673,7 +692,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
allocdzn = B_TRUE;
}
if (zap_leaf_array_match(zeh->zeh_leaf, zn,
- le->le_name_chunk, le->le_name_length)) {
+ le->le_name_chunk, le->le_name_numints)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
@@ -836,9 +855,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(l, chunk);
- n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
- le->le_int_size);
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_entries_using_n_chunks[n]++;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index 9453fd293870..b40309741dfa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
@@ -33,38 +31,98 @@
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
+#include <sys/arc.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
#endif
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
static uint64_t
-zap_hash(zap_t *zap, const char *normname)
+zap_hash(zap_name_t *zn)
{
- const uint8_t *cp;
- uint8_t c;
- uint64_t crc = zap->zap_salt;
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
- /* NB: name must already be normalized, if necessary */
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ int i;
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
+ int j;
+ uint64_t word = *wp;
+
+ for (j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ int i, len;
+ const uint8_t *cp = zn->zn_key_norm;
- ASSERT(crc != 0);
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
- }
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ len = zn->zn_key_norm_numints - 1;
+ ASSERT(zn->zn_key_intlen == 1);
+ for (i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
/*
- * Only use 28 bits, since we need 4 bits in the cookie for the
- * collision differentiator. We MUST use the high bits, since
- * those are the ones that we first pay attention to when
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
 * choosing the bucket.
*/
- crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
- return (crc);
+ return (h);
}
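
The closing mask keeps only the top zap_hashbits() bits of the hash, leaving the low bits of the 64-bit value free for the collision differentiator in the cursor cookie. A worked example with the default 28 hash bits:

    #include <stdint.h>
    #include <assert.h>

    int
    main(void)
    {
            uint64_t h = 0x0123456789abcdefULL;
            int bits = 28;  /* default zap_hashbits() value */

            /* Keep the top 28 bits; zero the low 36 for the cookie. */
            h &= ~((1ULL << (64 - bits)) - 1);
            assert(h == 0x0123456000000000ULL);
            return (0);
    }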
static int
@@ -73,13 +131,15 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
size_t inlen, outlen;
int err;
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
inlen = strlen(name) + 1;
outlen = ZAP_MAXNAMELEN;
err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
- zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
- &err);
+ zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
+ U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
return (err);
}
@@ -87,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
if (zn->zn_matchtype == MT_FIRST) {
char norm[ZAP_MAXNAMELEN];
if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
return (B_FALSE);
- return (strcmp(zn->zn_name_norm, norm) == 0);
+ return (strcmp(zn->zn_key_norm, norm) == 0);
} else {
/* MT_BEST or MT_EXACT */
- return (strcmp(zn->zn_name_orij, matchname) == 0);
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
}
}
@@ -106,30 +168,49 @@ zap_name_free(zap_name_t *zn)
kmem_free(zn, sizeof (zap_name_t));
}
-/* XXX combine this with zap_lockdir()? */
zap_name_t *
-zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
zn->zn_zap = zap;
- zn->zn_name_orij = name;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
zn->zn_matchtype = mt;
if (zap->zap_normflags) {
- if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
+ if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_normbuf;
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else {
if (mt != MT_EXACT) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_name_orij;
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_numints = zn->zn_key_orig_numints;
}
- zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+ zn->zn_matchtype = MT_EXACT;
+
+ zn->zn_hash = zap_hash(zn);
return (zn);
}
@@ -174,27 +255,27 @@ mze_compare(const void *arg1, const void *arg2)
return (+1);
if (mze1->mze_hash < mze2->mze_hash)
return (-1);
- if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd > mze2->mze_cd)
return (+1);
- if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd < mze2->mze_cd)
return (-1);
return (0);
}
static int
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
mzap_ent_t *mze;
avl_index_t idx;
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(mzep->mze_cd < ZAP_MAXCD);
mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
mze->mze_hash = hash;
- mze->mze_phys = *mzep;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
kmem_free(mze, sizeof (mzap_ent_t));
return (EEXIST);
@@ -214,18 +295,16 @@ mze_find(zap_name_t *zn)
ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
- if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
- return (NULL);
-
mze_tofind.mze_hash = zn->zn_hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
again:
mze = avl_find(avl, &mze_tofind, &idx);
if (mze == NULL)
mze = avl_nearest(avl, idx, AVL_AFTER);
for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
- if (zap_match(zn, mze->mze_phys.mze_name))
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze);
}
if (zn->zn_matchtype == MT_BEST) {
@@ -248,12 +327,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
mze_tofind.mze_hash = hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
cd = 0;
for (mze = avl_find(avl, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (mze->mze_phys.mze_cd != cd)
+ if (mze->mze_cd != cd)
break;
cd++;
}
@@ -292,15 +371,14 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
- rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
rw_enter(&zap->zap_rwlock, RW_WRITER);
zap->zap_objset = os;
zap->zap_object = obj;
zap->zap_dbuf = db;
if (*(uint64_t *)db->db_data != ZBT_MICRO) {
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
- MUTEX_DEFAULT, 0);
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
} else {
zap->zap_ismicro = TRUE;
@@ -337,7 +415,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zn = zap_name_alloc(zap, mze->mze_name,
MT_EXACT);
- if (mze_insert(zap, i, zn->zn_hash, mze) == 0)
+ if (mze_insert(zap, i, zn->zn_hash) == 0)
zap->zap_m.zap_num_entries++;
else {
printf("ZFS WARNING: Duplicated ZAP "
@@ -385,7 +463,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
*zapp = NULL;
- err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
@@ -435,7 +513,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
- return (mzap_upgrade(zapp, tx));
+ return (mzap_upgrade(zapp, tx, 0));
}
err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
ASSERT3U(err, ==, 0);
@@ -455,10 +533,11 @@ zap_unlockdir(zap_t *zap)
}
static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
- int i, sz, nchunks, err;
+ int i, sz, nchunks;
+ int err = 0;
zap_t *zap = *zapp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -468,11 +547,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
- err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
- 1ULL << fzap_default_block_shift, 0, tx);
- if (err) {
- kmem_free(mzp, sz);
- return (err);
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err) {
+ kmem_free(mzp, sz);
+ return (err);
+ }
}
dprintf("upgrading obj=%llu with %u chunks\n",
@@ -480,10 +561,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
/* XXX destroy the avl later, so we can use the stored hash value */
mze_destroy(zap);
- fzap_upgrade(zap, tx);
+ fzap_upgrade(zap, tx, flags);
for (i = 0; i < nchunks; i++) {
- int err;
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
zap_name_t *zn;
if (mze->mze_name[0] == 0)
@@ -503,12 +583,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
}
static void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+ dmu_tx_t *tx)
{
dmu_buf_t *db;
mzap_phys_t *zp;
- VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
#ifdef ZFS_DEBUG
{
@@ -524,6 +605,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
zp->mz_normflags = normflags;
dmu_buf_rele(db, FTAG);
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
+ zap_unlockdir(zap);
+ }
}
int
@@ -544,7 +634,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
if (err != 0)
return (err);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
return (0);
}
@@ -561,7 +651,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
{
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
+ return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+ ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+ leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+ indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+
+ VERIFY(dmu_object_set_blocksize(os, obj,
+ 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+
+ mzap_create_impl(os, obj, normflags, flags, tx);
return (obj);
}
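
[Editor's note: the new zap_create_flags() entry point is what callers such as the dedup table use to force a fat zap with specific block sizes at creation time. A minimal sketch of a call, assuming the ZAP_FLAG_* constants and the DMU_OT_DDT_ZAP object type that accompany this import:]

	/*
	 * Sketch only: create a fat zap keyed by uint64 arrays, with 4K
	 * (2^12) leaf and indirect blocks.  The tx is assumed to already
	 * be assigned; the flag and type names are assumptions.
	 */
	uint64_t obj = zap_create_flags(os, 0,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY | ZAP_FLAG_PRE_HASHED_KEY,
	    DMU_OT_DDT_ZAP, 12, 12, DMU_OT_NONE, 0, tx);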
@@ -631,11 +740,11 @@ again:
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
if (zn == NULL) {
- zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
MT_FIRST);
allocdzn = B_TRUE;
}
- if (zap_match(zn, other->mze_phys.mze_name)) {
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
@@ -697,9 +806,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
} else if (integer_size != 8) {
err = EINVAL;
} else {
- *(uint64_t *)buf = mze->mze_phys.mze_value;
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
(void) strlcpy(realname,
- mze->mze_phys.mze_name, rn_len);
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
if (ncp) {
*ncp = mzap_normalization_conflict(zap,
zn, mze);
@@ -713,6 +823,63 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+ int err = (zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, MT_EXACT, NULL, 0, NULL));
+ if (err == EOVERFLOW || err == EINVAL)
+ err = 0; /* found, but skipped reading the value */
+ return (err);
+}
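
[Editor's note: zap_contains() relies on the fact that a zero-length lookup of an existing entry fails with EOVERFLOW (value too large for the buffer) or EINVAL (integer-size mismatch), so either error proves existence. A usage sketch with a hypothetical attribute name:]

	/* Does the entry exist?  The value is deliberately not read. */
	if (zap_contains(os, zapobj, "com.example:marker") == 0) {
		/* present */
	}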
+
+int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers)
{
@@ -747,6 +914,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
@@ -755,20 +944,18 @@ mzap_addent(zap_name_t *zn, uint64_t value)
int start = zap->zap_m.zap_alloc_next;
uint32_t cd;
- dprintf("obj=%llu %s=%llu\n", zap->zap_object,
- zn->zn_name_orij, value);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
#ifdef ZFS_DEBUG
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
cd = mze_find_unused_cd(zap, zn->zn_hash);
/* given the limited size of the microzap, this can't happen */
- ASSERT(cd != ZAP_MAXCD);
+ ASSERT(cd < zap_maxcd(zap));
again:
for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
@@ -776,13 +963,13 @@ again:
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, zn->zn_name_orij);
+ (void) strcpy(mze->mze_name, zn->zn_key_orig);
zap->zap_m.zap_num_entries++;
zap->zap_m.zap_alloc_next = i+1;
if (zap->zap_m.zap_alloc_next ==
zap->zap_m.zap_num_chunks)
zap->zap_m.zap_alloc_next = 0;
- VERIFY(0 == mze_insert(zap, i, zn->zn_hash, mze));
+ VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
return;
}
}
@@ -794,7 +981,7 @@ again:
}
int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
@@ -807,7 +994,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
- zn = zap_name_alloc(zap, name, MT_EXACT);
+ zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
return (ENOTSUP);
@@ -816,10 +1003,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ strlen(key) >= MZAP_NAME_LEN) {
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
@@ -839,15 +1024,50 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
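
[Editor's note: together with zap_lookup_uint64() above, this gives a round trip for binary-keyed ZAPs. A sketch, assuming the object was created with ZAP_FLAG_UINT64_KEY; otherwise zap_name_alloc_uint64() fails and these calls return ENOTSUP:]

static int
example_u64_roundtrip(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	uint64_t key[2] = { 0xdeadbeefULL, 42ULL };
	uint64_t val = 1234, out = 0;
	int err;

	/* Store one 8-byte integer under a two-word key ... */
	err = zap_add_uint64(os, zapobj, key, 2, 8, 1, &val, tx);
	/* ... and read it back. */
	if (err == 0)
		err = zap_lookup_uint64(os, zapobj, key, 2, 8, 1, &out);
	return (err);
}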
+
+int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
mzap_ent_t *mze;
+ uint64_t oldval;
const uint64_t *intval = val;
zap_name_t *zn;
int err;
+#ifdef ZFS_DEBUG
+ /*
+ * If there is an old value, it shouldn't change across the
+ * lockdir (e.g., due to bp rewrite's translation).
+ */
+ if (integer_size == 8 && num_integers == 1)
+ (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
@@ -863,7 +1083,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_update(zn, integer_size, num_integers,
val, tx);
@@ -871,9 +1091,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
} else {
mze = mze_find(zn);
if (mze != NULL) {
- mze->mze_phys.mze_value = *intval;
- zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid].mze_value = *intval;
+ ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+ MZE_PHYS(zap, mze)->mze_value = *intval;
} else {
mzap_addent(zn, *intval);
}
@@ -886,6 +1105,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ zap_name_t *zn;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
@@ -926,17 +1170,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
/*
* Routines for iterating over the attributes.
*/
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
@@ -945,15 +1204,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
zc->zc_zapobj = zapobj;
- if (serialized == -1ULL) {
- zc->zc_hash = -1ULL;
- zc->zc_cd = 0;
- } else {
- zc->zc_hash = serialized << (64-ZAP_HASHBITS);
- zc->zc_cd = serialized >> ZAP_HASHBITS;
- if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
- zc->zc_cd = 0;
- }
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
}
void
@@ -983,10 +1236,21 @@ zap_cursor_serialize(zap_cursor_t *zc)
{
if (zc->zc_hash == -1ULL)
return (-1ULL);
- ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
- ASSERT(zc->zc_cd < ZAP_MAXCD);
- return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
- ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So we usually use a small
+ * (28-bit) hash value, which lets us fit 4 bits of cd into the low
+ * 32 bits of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
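
[Editor's note: the packing is unchanged for a microzap (28 hash bits, 4 cd bits), but both widths now come from the zap itself. A toy helper mirroring the return expression above, for illustration only:]

/* Illustration: pack (hash, cd) the way zap_cursor_serialize() does. */
static uint64_t
example_pack_cursor(uint64_t hash, uint64_t cd, int hashbits)
{
	return ((hash >> (64 - hashbits)) | (cd << hashbits));
}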
int
@@ -1001,10 +1265,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
return (ENOENT);
if (zc->zc_zap == NULL) {
+ int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
if (err)
return (err);
+
+ /*
+ * To support the sequence zap_cursor_init_serialized(),
+ * zap_cursor_advance(), zap_cursor_retrieve(), we must add to the
+ * existing zc_cd, which may already be 1 due to the advance.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
} else {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
}
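
[Editor's note: decoding the serialized value is now deferred until the first retrieve, when the zap (and therefore its hash width) is known. Resuming iteration still looks like the usual cursor loop; a sketch where "saved" came from an earlier zap_cursor_serialize():]

	zap_cursor_t zc;
	zap_attribute_t za;

	zap_cursor_init_serialized(&zc, os, zapobj, saved);
	while (zap_cursor_retrieve(&zc, &za) == 0) {
		/* consume za.za_name / za.za_first_integer */
		zap_cursor_advance(&zc);
	}
	zap_cursor_fini(&zc);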
@@ -1014,7 +1291,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
err = ENOENT;
mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+ mze_tofind.mze_cd = zc->zc_cd;
mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
if (mze == NULL) {
@@ -1022,18 +1299,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
idx, AVL_AFTER);
}
if (mze) {
- ASSERT(0 == bcmp(&mze->mze_phys,
- &zc->zc_zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict =
mzap_normalization_conflict(zc->zc_zap, NULL, mze);
za->za_integer_length = 8;
za->za_num_integers = 1;
- za->za_first_integer = mze->mze_phys.mze_value;
- (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ za->za_first_integer = mzep->mze_value;
+ (void) strcpy(za->za_name, mzep->mze_name);
zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
+ zc->zc_cd = mze->mze_cd;
err = 0;
} else {
zc->zc_hash = -1ULL;
@@ -1049,12 +1324,46 @@ zap_cursor_advance(zap_cursor_t *zc)
if (zc->zc_hash == -1ULL)
return;
zc->zc_cd++;
- if (zc->zc_cd >= ZAP_MAXCD) {
- zc->zc_cd = 0;
- zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
- if (zc->zc_hash == 0) /* EOF */
- zc->zc_hash = -1ULL;
+}
+
+int
+zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
+{
+ int err = 0;
+ mzap_ent_t *mze;
+ zap_name_t *zn;
+
+ if (zc->zc_zap == NULL) {
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, &zc->zc_zap);
+ if (err)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+
+ zn = zap_name_alloc(zc->zc_zap, name, mt);
+ if (zn == NULL) {
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (ENOTSUP);
+ }
+
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_move_to_key(zc, zn);
+ } else {
+ mze = mze_find(zn);
+ if (mze == NULL) {
+ err = ENOENT;
+ goto out;
+ }
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
}
+
+out:
+ zap_name_free(zn);
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
}
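
[Editor's note: zap_cursor_move_to_key() positions an open cursor so that iteration continues from the named entry. A hypothetical use:]

	zap_cursor_t zc;
	zap_attribute_t za;

	zap_cursor_init(&zc, os, zapobj);
	if (zap_cursor_move_to_key(&zc, "known-entry", MT_EXACT) == 0) {
		/* iteration now resumes at "known-entry" */
		while (zap_cursor_retrieve(&zc, &za) == 0)
			zap_cursor_advance(&zc);
	}
	zap_cursor_fini(&zc);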
int
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index fc25bfe1de1f..f893383a5370 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -48,6 +47,7 @@
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <acl/acl_common.h>
#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
@@ -71,8 +71,7 @@
#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
ACE_DELETE|ACE_DELETE_CHILD)
-#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\
- ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
@@ -319,6 +318,117 @@ static acl_ops_t zfs_acl_fuid_ops = {
zfs_ace_fuid_data
};
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  It would really be nice not to need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential race where zfs_sa_upgrade
+ * could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * After upgrade, the SA_ZPL_ZNODE_ACL attribute should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential race where
+ * zfs_sa_upgrade could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
static int
zfs_acl_version(int version)
{
@@ -334,7 +444,7 @@ zfs_acl_version_zp(znode_t *zp)
return (zfs_acl_version(zp->z_zfsvfs->z_version));
}
-static zfs_acl_t *
+zfs_acl_t *
zfs_acl_alloc(int vers)
{
zfs_acl_t *aclp;
@@ -350,7 +460,7 @@ zfs_acl_alloc(int vers)
return (aclp);
}
-static zfs_acl_node_t *
+zfs_acl_node_t *
zfs_acl_node_alloc(size_t bytes)
{
zfs_acl_node_t *aclnode;
@@ -461,6 +571,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
{
zfs_acl_node_t *aclnode;
+ ASSERT(aclp);
+
if (start == NULL) {
aclnode = list_head(&aclp->z_acl);
if (aclnode == NULL)
@@ -507,6 +619,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
*who = aclp->z_ops.ace_who_get(acep);
aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
aclnode->z_ace_idx++;
+
return ((void *)acep);
}
return (NULL);
@@ -540,7 +653,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
*/
int
zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
- void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
zfs_fuid_info_t **fuidp, cred_t *cr)
{
int i;
@@ -771,8 +884,9 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
* Determine mode of file based on ACL.
* Also, create FUIDs for any User/Group ACEs
*/
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
{
int entry_type;
mode_t mode;
@@ -783,7 +897,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
uint32_t access_mask;
boolean_t an_exec_denied = B_FALSE;
- mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
while (acep = zfs_acl_next_ace(aclp, acep, &who,
&access_mask, &iflags, &type)) {
@@ -801,7 +915,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
entry_type == OWNING_GROUP))
continue;
- if (entry_type == ACE_OWNER) {
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
if ((access_mask & ACE_READ_DATA) &&
(!(seen & S_IRUSR))) {
seen |= S_IRUSR;
@@ -823,7 +938,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
mode |= S_IXUSR;
}
}
- } else if (entry_type == OWNING_GROUP) {
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
if ((access_mask & ACE_READ_DATA) &&
(!(seen & S_IRGRP))) {
seen |= S_IRGRP;
@@ -928,61 +1044,29 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
an_exec_denied = B_TRUE;
if (an_exec_denied)
- zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
else
- zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+ *pflags |= ZFS_NO_EXECS_DENIED;
return (mode);
}
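
[Editor's note: with the znode dependency removed, zfs_mode_compute() can run on values not yet attached to a znode, as zfs_acl_ids_create() does later in this diff. The common pattern for an existing znode, mirroring zfs_acl_chown_setattr() below:]

	zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags,
	    zp->z_uid, zp->z_gid);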
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
- zfs_acl_t *aclp;
- zfs_acl_node_t *aclnode;
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-
- /*
- * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
- * Version 0 didn't have a size field, only a count.
- */
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size;
- aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
- } else {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size;
- }
-
- aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0);
- aclnode->z_ace_count = aclp->z_acl_count;
- if (will_modify) {
- bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
- aclp->z_acl_bytes);
- } else {
- aclnode->z_size = aclp->z_acl_bytes;
- aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
- }
-
- list_insert_head(&aclp->z_acl, aclnode);
-
- return (aclp);
-}
-
/*
* Read an external acl object. If the intent is to modify, always
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
{
- uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
zfs_acl_t *aclp;
- size_t aclsize;
- size_t acl_count;
+ int aclsize;
+ int acl_count;
zfs_acl_node_t *aclnode;
- int error;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+ boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
@@ -991,46 +1075,97 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
return (0);
}
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
- *aclpp = zfs_acl_node_read_internal(zp, will_modify);
- if (!will_modify)
- zp->z_acl_cached = *aclpp;
- return (0);
+ /*
+ * Close the race where the znode could be upgraded while we
+ * are trying to read its attributes.
+ *
+ * This can only happen if the file isn't already an SA znode.
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
}
+ version = zfs_znode_acl_version(zp);
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- zfs_acl_phys_v0_t *zacl0 =
- (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
-
- aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
- acl_count = zacl0->z_acl_count;
- } else {
- aclsize = zp->z_phys->zp_acl.z_acl_size;
- acl_count = zp->z_phys->zp_acl.z_acl_count;
- if (aclsize == 0)
- aclsize = acl_count * sizeof (zfs_ace_t);
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
}
- aclnode = zfs_acl_node_alloc(aclsize);
- list_insert_head(&aclp->z_acl, aclnode);
- error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
- aclnode->z_ace_count = acl_count;
+
+ aclp = zfs_acl_alloc(version);
+
aclp->z_acl_count = acl_count;
aclp->z_acl_bytes = aclsize;
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
if (error != 0) {
zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = EIO;
- return (error);
+ goto done;
}
+ list_insert_head(&aclp->z_acl, aclnode);
+
*aclpp = aclp;
if (!will_modify)
zp->z_acl_cached = aclp;
- return (0);
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
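
[Editor's note: the locator callback lets the SA layer stream an ACL that is scattered across several zfs_acl_node_t buffers without first flattening it. Wiring it up follows the pattern used in zfs_aclset_common() below:]

	zfs_acl_locator_cb_t locate = { 0 };

	locate.cb_aclp = aclp;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
	    zfs_acl_data_locator, &locate, aclp->z_acl_bytes);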
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
}
/*
@@ -1043,28 +1178,35 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
int
zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
{
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_acl_phys_t *zacl = &zphys->zp_acl;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
- uint64_t off = 0;
- dmu_object_type_t otype;
- zfs_acl_node_t *aclnode;
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
if (zp->z_acl_cached) {
zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = NULL;
}
- zphys->zp_mode = zfs_mode_compute(zp, aclp);
-
/*
- * Decide which object type to use. If we are forced to
- * use old ACL format then transform ACL into zfs_oldace_t
- * layout.
+ * Upgrade needed?
*/
if (!zfsvfs->z_use_fuids) {
otype = DMU_OT_OLDACL;
@@ -1076,84 +1218,113 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
otype = DMU_OT_ACL;
}
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid && aclp->z_version != zacl->z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
+ /*
+ * Arrgh, we have to handle old on disk format
+ * as well as newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ zfs_acl_phys_t acl_phys;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
} else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- aclp->z_acl_bytes, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
}
- } else {
- void *start = zacl->z_ace_data;
/*
- * Migrating back embedded?
+ * If Old version then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
*/
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
- }
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start, aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
}
- }
+ acl_phys.z_acl_version = aclp->z_version;
- /*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- zphys->zp_acl.z_acl_size = aclp->z_acl_count;
- zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
- } else {
- zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
}
- zphys->zp_acl.z_acl_version = aclp->z_version;
-
/*
* Replace ACL wide bits, but first clear them.
*/
- zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
- zp->z_phys->zp_flags |= aclp->z_hints;
+ zp->z_pflags |= aclp->z_hints;
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
- return (0);
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}
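
[Editor's note: the bulk-update idiom above gathers every attribute change and commits them with a single sa_bulk_update() call; stripped to its skeleton it reads:]

	sa_bulk_attr_t bulk[5];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);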
/*
@@ -1223,314 +1394,64 @@ zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep,
aclp->z_ops.ace_mask_set(acep, acepmask);
}
-/*
- * Apply mode to canonical six ACEs.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
- zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
- void *acep;
- int maskoff = aclp->z_ops.ace_mask_off();
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
- ASSERT(aclnode != NULL);
-
- acep = (void *)((caddr_t)aclnode->z_acldata +
- aclnode->z_size - (abstract_size * 6));
-
- /*
- * Fixup final ACEs to match the mode
- */
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0700) >> 6); /* owner@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0070) >> 3); /* group@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
- adjust_ace_pair_common(acep, maskoff,
- abstract_size, mode); /* everyone@ */
-}
-
-
-static int
-zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny,
- int entry_type, int accessmask)
-{
- uint32_t mask = aclp->z_ops.ace_mask_get(acep);
- uint16_t type = aclp->z_ops.ace_type_get(acep);
- uint16_t flags = aclp->z_ops.ace_flags_get(acep);
-
- return (mask == accessmask && type == allow_deny &&
- ((flags & ACE_TYPE_FLAGS) == entry_type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep)
-{
- int okay_masks;
- uint16_t prevtype;
- uint16_t prevflags;
- uint16_t flags;
- uint32_t mask, prevmask;
-
- if (prevacep == NULL)
- return (B_FALSE);
-
- prevtype = aclp->z_ops.ace_type_get(prevacep);
- prevflags = aclp->z_ops.ace_flags_get(prevacep);
- flags = aclp->z_ops.ace_flags_get(acep);
- mask = aclp->z_ops.ace_mask_get(acep);
- prevmask = aclp->z_ops.ace_mask_get(prevacep);
-
- if (prevtype != DENY)
- return (B_FALSE);
-
- if (prevflags != (flags & ACE_IDENTIFIER_GROUP))
- return (B_FALSE);
-
- okay_masks = (mask & OKAY_MASK_BITS);
-
- if (prevmask & ~okay_masks)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Insert new ACL node into chain of zfs_acl_node_t's
- *
- * This will result in two possible results.
- * 1. If the ACL is currently just a single zfs_acl_node and
- * we are prepending the entry then current acl node will have
- * a new node inserted above it.
- *
- * 2. If we are inserting in the middle of current acl node then
- * the current node will be split in two and new node will be inserted
- * in between the two split nodes.
- */
-static zfs_acl_node_t *
-zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
-{
- zfs_acl_node_t *newnode;
- zfs_acl_node_t *trailernode = NULL;
- zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp);
- int curr_idx = aclp->z_curr_node->z_ace_idx;
- int trailer_count;
- size_t oldsize;
-
- newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
- newnode->z_ace_count = 1;
-
- oldsize = currnode->z_size;
-
- if (curr_idx != 1) {
- trailernode = zfs_acl_node_alloc(0);
- trailernode->z_acldata = acep;
-
- trailer_count = currnode->z_ace_count - curr_idx + 1;
- currnode->z_ace_count = curr_idx - 1;
- currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
- trailernode->z_size = oldsize - currnode->z_size;
- trailernode->z_ace_count = trailer_count;
- }
-
- aclp->z_acl_count += 1;
- aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
-
- if (curr_idx == 1)
- list_insert_before(&aclp->z_acl, currnode, newnode);
- else
- list_insert_after(&aclp->z_acl, currnode, newnode);
- if (trailernode) {
- list_insert_after(&aclp->z_acl, newnode, trailernode);
- aclp->z_curr_node = trailernode;
- trailernode->z_ace_idx = 1;
- }
-
- return (newnode);
-}
-
-/*
- * Prepend deny ACE
- */
-static void *
-zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
- mode_t mode)
-{
- zfs_acl_node_t *aclnode;
- void *newacep;
- uint64_t fuid;
- uint16_t flags;
-
- aclnode = zfs_acl_ace_insert(aclp, acep);
- newacep = aclnode->z_acldata;
- fuid = aclp->z_ops.ace_who_get(acep);
- flags = aclp->z_ops.ace_flags_get(acep);
- zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
-
- return (newacep);
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
static void
-zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
{
- zfs_acl_node_t *aclnode;
- zfs_acl_node_t *currnode;
- void *newacep;
- uint16_t type, flags;
- uint32_t mask;
- uint64_t fuid;
-
- type = aclp->z_ops.ace_type_get(acep);
- flags = aclp->z_ops.ace_flags_get(acep);
- mask = aclp->z_ops.ace_mask_get(acep);
- fuid = aclp->z_ops.ace_who_get(acep);
-
- aclnode = zfs_acl_ace_insert(aclp, acep);
- newacep = aclnode->z_acldata;
-
- aclp->z_ops.ace_type_set(newacep, type);
- aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
- aclp->z_ops.ace_mask_set(newacep, mask);
- aclp->z_ops.ace_type_set(newacep, type);
- aclp->z_ops.ace_who_set(newacep, fuid);
- aclp->z_next_ace = acep;
- flags &= ~ALL_INHERIT;
- aclp->z_ops.ace_flags_set(acep, flags);
- currnode = zfs_acl_curr_node(aclp);
- ASSERT(currnode->z_ace_idx >= 1);
- currnode->z_ace_idx -= 1;
-}
-
-/*
- * Are ACES started at index i, the canonical six ACES?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp)
-{
- void *acep;
- zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
- int i = 0;
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
- ASSERT(aclnode != NULL);
-
- if (aclnode->z_ace_count < 6)
- return (0);
-
- acep = (void *)((caddr_t)aclnode->z_acldata +
- aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6));
-
- if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- DENY, ACE_OWNER, 0) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY,
- OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep +
- (abstract_size * i++),
- ALLOW, OWNING_GROUP, 0) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) {
- return (1);
- } else {
- return (0);
- }
-}
-
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner. If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
- mode_t mode)
-{
- uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
- uint32_t mask = aclp->z_ops.ace_mask_get(acep);
- uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
- mode_t extramode = (mode >> 3) & 07;
- mode_t ownermode = (mode >> 6);
-
- if (prevflags & ACE_IDENTIFIER_GROUP) {
-
- extramode &= ~ownermode;
-
- if (extramode) {
- if (extramode & S_IROTH) {
- prevmask &= ~ACE_READ_DATA;
- mask &= ~ACE_READ_DATA;
- }
- if (extramode & S_IWOTH) {
- prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- }
- if (extramode & S_IXOTH) {
- prevmask &= ~ACE_EXECUTE;
- mask &= ~ACE_EXECUTE;
- }
- }
- }
- aclp->z_ops.ace_mask_set(acep, mask);
- aclp->z_ops.ace_mask_set(prevacep, prevmask);
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
-static void
-zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
- uint64_t mode, zfs_acl_t *aclp)
-{
- void *acep = NULL, *prevacep = NULL;
+ void *acep = NULL;
uint64_t who;
- int i;
+ int new_count, new_bytes;
+ int ace_size;
int entry_type;
- int reuse_deny;
- int need_canonical_six = 1;
uint16_t iflags, type;
uint32_t access_mask;
-
- /*
- * If discard then just discard all ACL nodes which
- * represent the ACEs.
- *
- * New owner@/group@/everone@ ACEs will be added
- * later.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
- zfs_acl_release_nodes(aclp);
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops.ace_abstract_size();
+ void *zacep;
+ uint32_t owner, group, everyone;
+ uint32_t deny1, deny2, allow0;
+
+ new_count = new_bytes = 0;
+
+ acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2,
+ &owner, &group, &everyone);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (allow0) {
+ zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny1) {
+ zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny2) {
+ zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
+ uint16_t inherit_flags;
entry_type = (iflags & ACE_TYPE_FLAGS);
- iflags = (iflags & ALL_INHERIT);
+ inherit_flags = (iflags & ALL_INHERIT);
+
+ if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) &&
+ ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) {
+ continue;
+ }
if ((type != ALLOW && type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- if (iflags)
+ (inherit_flags & ACE_INHERIT_ONLY_ACE)) {
+ if (inherit_flags)
aclp->z_hints |= ZFS_INHERIT_ACE;
switch (type) {
case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@@ -1540,116 +1461,58 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
aclp->z_hints |= ZFS_ACL_OBJ_ACE;
break;
}
- goto nextace;
- }
-
- /*
- * Need to split ace into two?
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) &&
- (!(iflags & ACE_INHERIT_ONLY_ACE))) {
- zfs_acl_split_ace(aclp, acep);
- aclp->z_hints |= ZFS_INHERIT_ACE;
- goto nextace;
- }
-
- if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) {
- access_mask &= ~OGE_CLEAR;
- aclp->z_ops.ace_mask_set(acep, access_mask);
- goto nextace;
} else {
- reuse_deny = B_TRUE;
- if (type == ALLOW) {
-
- /*
- * Check preceding ACE if any, to see
- * if we need to prepend a DENY ACE.
- * This is only applicable when the acl_mode
- * property == groupmask.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
- reuse_deny = zfs_reuse_deny(aclp, acep,
- prevacep);
-
- if (!reuse_deny) {
- prevacep =
- zfs_acl_prepend_deny(uid,
- aclp, acep, mode);
- } else {
- zfs_acl_prepend_fixup(
- aclp, prevacep,
- acep, mode, uid);
- }
- zfs_fixup_group_entries(aclp, acep,
- prevacep, mode);
- }
- }
- }
-nextace:
- prevacep = acep;
- }
-
- /*
- * Check out last six aces, if we have six.
- */
- if (aclp->z_acl_count >= 6) {
- if (zfs_have_canonical_six(aclp)) {
- need_canonical_six = 0;
+ /*
+ * Limit permissions to be no greater than
+ * group permissions.
+ */
+ if (type == ALLOW && zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) {
+ if (!(mode & S_IRGRP))
+ access_mask &= ~ACE_READ_DATA;
+ if (!(mode & S_IWGRP))
+ access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ if (!(mode & S_IXGRP))
+ access_mask &= ~ACE_EXECUTE;
+ access_mask &=
+ ~(ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS);
+ }
}
- }
-
- if (need_canonical_six) {
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
- zfs_acl_node_t *aclnode =
- zfs_acl_node_alloc(abstract_size * 6);
-
- aclnode->z_size = abstract_size * 6;
- aclnode->z_ace_count = 6;
- aclp->z_acl_bytes += aclnode->z_size;
- list_insert_tail(&aclp->z_acl, aclnode);
-
- zacep = aclnode->z_acldata;
-
- i = 0;
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- 0, DENY, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- DENY, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- ALLOW, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE);
- aclp->z_acl_count += 6;
- }
-
- zfs_acl_fixup_canonical_six(aclp, mode);
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops.ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
}
-int
+void
zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
{
- int error;
-
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
- *aclp = NULL;
- error = zfs_acl_node_read(zp, aclp, B_TRUE);
- if (error == 0) {
- (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
- zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
- }
- mutex_exit(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp);
mutex_exit(&zp->z_lock);
- return (error);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(*aclp);
}
/*
@@ -1691,8 +1554,8 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
uint64_t mode, boolean_t *need_chmod)
{
void *pacep;
- void *acep, *acep2;
- zfs_acl_node_t *aclnode, *aclnode2;
+ void *acep;
+ zfs_acl_node_t *aclnode;
zfs_acl_t *aclp = NULL;
uint64_t who;
uint32_t access_mask;
@@ -1714,7 +1577,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
*need_chmod = B_TRUE;
pacep = NULL;
aclp = zfs_acl_alloc(paclp->z_version);
- if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD)
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype == VLNK)
return (aclp);
while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
&access_mask, &iflags, &type)) {
@@ -1743,11 +1606,11 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
OWNING_GROUP)) && (vreg || (vdir && (iflags &
ACE_DIRECTORY_INHERIT_ACE)))) {
*need_chmod = B_FALSE;
+ }
- if (!vdir && passthrough_x &&
- ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
- access_mask &= ~ACE_EXECUTE;
- }
+ if (!vdir && passthrough_x &&
+ ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
}
aclnode = zfs_acl_node_alloc(ace_size);
@@ -1765,6 +1628,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
&data2)) == data1sz);
bcopy(data1, data2, data2sz);
}
+
aclp->z_acl_count++;
aclnode->z_ace_count++;
aclp->z_acl_bytes += aclnode->z_size;
@@ -1783,38 +1647,17 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
ASSERT(vdir);
- newflags = aclp->z_ops.ace_flags_get(acep);
+ /*
+ * If only FILE_INHERIT is set, then turn on
+ * inherit_only.
+ */
if ((iflags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) !=
- ACE_FILE_INHERIT_ACE) {
- aclnode2 = zfs_acl_node_alloc(ace_size);
- list_insert_tail(&aclp->z_acl, aclnode2);
- acep2 = aclnode2->z_acldata;
- zfs_set_ace(aclp, acep2,
- access_mask, type, who,
- iflags|ACE_INHERITED_ACE);
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
newflags |= ACE_INHERIT_ONLY_ACE;
- aclp->z_ops.ace_flags_set(acep, newflags);
- newflags &= ~ALL_INHERIT;
- aclp->z_ops.ace_flags_set(acep2,
+ aclp->z_ops.ace_flags_set(acep,
newflags|ACE_INHERITED_ACE);
-
- /*
- * Copy special opaque data if any
- */
- if ((data1sz = aclp->z_ops.ace_data(acep,
- &data1)) != 0) {
- VERIFY((data2sz =
- aclp->z_ops.ace_data(acep2,
- &data2)) == data1sz);
- bcopy(data1, data2, data1sz);
- }
- aclp->z_acl_count++;
- aclnode2->z_ace_count++;
- aclp->z_acl_bytes += aclnode->z_size;
- zfs_restricted_update(zfsvfs, aclp, acep2);
} else {
- newflags |= ACE_INHERIT_ONLY_ACE;
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
aclp->z_ops.ace_flags_set(acep,
newflags|ACE_INHERITED_ACE);
}
@@ -1835,6 +1678,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
zfs_acl_t *paclp;
gid_t gid;
boolean_t need_chmod = B_TRUE;
+ boolean_t inherited = B_FALSE;
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1843,7 +1687,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
&acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
return (error);
-
/*
* Determine uid and gid.
*/
@@ -1865,21 +1708,36 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
(uint64_t)vap->va_gid,
cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
- if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
+ if (acl_ids->z_fgid != dzp->z_gid &&
!groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0)
acl_ids->z_fgid = 0;
}
if (acl_ids->z_fgid == 0) {
- if (dzp->z_phys->zp_mode & S_ISGID) {
- acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
} else {
acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
ZFS_GROUP, cr, &acl_ids->z_fuidp);
#ifdef __FreeBSD__
- gid = acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ gid = acl_ids->z_fgid = dzp->z_gid;
#else
gid = crgetgid(cr);
#endif
@@ -1894,7 +1752,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
* file's new group, clear the file's set-GID bit.
*/
- if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
(vap->va_type == VDIR)) {
acl_ids->z_mode |= S_ISGID;
} else {
@@ -1904,28 +1762,38 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
}
if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
- (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
- !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
- mutex_enter(&dzp->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
- mutex_exit(&dzp->z_acl_lock);
+ (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
} else {
acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
- acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+ acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
ZFS_ACL_AUTO_INHERIT : 0;
- zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
- acl_ids->z_mode, acl_ids->z_aclp);
+ zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp);
}
}
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
return (0);
}
@@ -1946,8 +1814,8 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
boolean_t
zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
{
- return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
- zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+ return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+ zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
}
/*
@@ -1965,15 +1833,15 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
- if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
- return (error);
-
if (mask == 0)
return (ENOSYS);
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
+ return (error);
+
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1982,8 +1850,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
/*
* Scan ACL to determine number of ACEs
*/
- if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
- !(mask & VSA_ACE_ALLTYPES)) {
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
void *zacep = NULL;
uint64_t who;
uint32_t access_mask;
@@ -2004,7 +1871,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
vsecp->vsa_aclcnt = count;
} else
- count = aclp->z_acl_count;
+ count = (int)aclp->z_acl_count;
if (mask & VSA_ACECNT) {
vsecp->vsa_aclcnt = count;
@@ -2038,11 +1905,11 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
if (mask & VSA_ACE_ACLFLAGS) {
vsecp->vsa_aclflags = 0;
- if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
}
@@ -2120,11 +1987,12 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
zfs_acl_t *aclp;
zfs_fuid_info_t *fuidp = NULL;
boolean_t fuid_dirtied;
+ uint64_t acl_obj;
if (mask == 0)
return (ENOSYS);
- if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
+ if (zp->z_pflags & ZFS_IMMUTABLE)
return (EPERM);
if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
@@ -2140,37 +2008,41 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
* existing flags.
*/
if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
- aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
}
top:
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL? */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- zp->z_phys->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
} else {
- dmu_tx_hold_write(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, aclp->z_acl_bytes);
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
}
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
}
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
mutex_exit(&zp->z_acl_lock);
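
The hunk above moves zfs_setacl() to SA-based holds and the standard TXG_NOWAIT retry idiom around dmu_tx_assign(). A minimal hedged sketch of that idiom, with hypothetical do_holds()/do_work() helpers standing in for the ACL-specific parts:

	/* Hypothetical helpers standing in for the caller-specific parts. */
	static void do_holds(dmu_tx_t *tx);
	static void do_work(dmu_tx_t *tx);

	static int
	modify_with_retry(objset_t *os)
	{
		dmu_tx_t *tx;
		int error;
	top:
		tx = dmu_tx_create(os);
		do_holds(tx);		/* the dmu_tx_hold_*() calls */
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				/* txg is full: wait, drop the tx, rebuild holds */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			return (error);
		}
		do_work(tx);		/* mutate under the assigned tx */
		dmu_tx_commit(tx);
		return (0);
	}
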
@@ -2188,20 +2060,20 @@ top:
error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
if (fuidp)
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
done:
- mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
return (error);
}
@@ -2226,15 +2098,15 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
*/
if ((v4_mode & WRITE_MASK_DATA) &&
(((ZTOV(zp)->v_type != VDIR) &&
- (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
(ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
return (EPERM);
}
#ifdef sun
if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+ (zp->z_pflags & ZFS_NOUNLINK)) {
return (EPERM);
}
#else
@@ -2244,13 +2116,13 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
* handled in zfs_zaccess_delete().
*/
if ((v4_mode & ACE_DELETE) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+ (zp->z_pflags & ZFS_NOUNLINK)) {
return (EPERM);
}
#endif
if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
return (EACCES);
}
@@ -2297,19 +2169,21 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
uint32_t deny_mask = 0;
zfs_ace_hdr_t *acep = NULL;
boolean_t checkit;
- uid_t fowner;
uid_t gowner;
+ uid_t fowner;
zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
}
+ ASSERT(zp->z_acl_cached);
+
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
uint32_t mask_matched;
@@ -2409,18 +2283,10 @@ zfs_has_access(znode_t *zp, cred_t *cr)
uint32_t have = ACE_ALL_PERMS;
if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- uid_t owner;
-
- owner = zfs_fuid_map_id(zp->z_zfsvfs,
- zp->z_phys->zp_uid, cr, ZFS_OWNER);
+ uid_t owner;
- return (
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
- secpolicy_vnode_chown(ZTOV(zp), cr, owner) == 0 ||
- secpolicy_vnode_setdac(ZTOV(zp), cr, owner) == 0 ||
- secpolicy_vnode_remove(ZTOV(zp), cr) == 0);
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
}
return (B_TRUE);
}
@@ -2478,38 +2344,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
boolean_t owner = B_FALSE;
boolean_t groupmbr = B_FALSE;
boolean_t is_attr;
- uid_t fowner;
- uid_t gowner;
uid_t uid = crgetuid(cr);
int error;
- if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
- is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
(ZTOV(zdp)->v_type == VDIR));
if (is_attr)
goto slow;
+
mutex_enter(&zdp->z_acl_lock);
- if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
- if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
- FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
mutex_exit(&zdp->z_acl_lock);
goto slow;
}
- fowner = (uid_t)zdp->z_phys->zp_uid;
- gowner = (uid_t)zdp->z_phys->zp_gid;
-
- if (uid == fowner) {
+ if (uid == zdp->z_uid) {
owner = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXUSR) {
+ if (zdp->z_mode & S_IXUSR) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2517,9 +2378,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
goto slow;
}
}
- if (groupmember(gowner, cr)) {
+ if (groupmember(zdp->z_gid, cr)) {
groupmbr = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXGRP) {
+ if (zdp->z_mode & S_IXGRP) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2528,7 +2389,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
}
}
if (!owner && !groupmbr) {
- if (zdp->z_phys->zp_mode & S_IXOTH) {
+ if (zdp->z_mode & S_IXOTH) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
@@ -2545,8 +2406,9 @@ slow:
}
/*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsytem when a deny is determined.
+ * Determine whether access should be granted/denied.
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
*/
int
zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
@@ -2554,13 +2416,13 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
uint32_t working_mode;
int error;
int is_attr;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
boolean_t check_privs;
znode_t *xzp;
znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
#ifdef __FreeBSD__
/*
@@ -2568,15 +2430,22 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
* Note that not checking them is not just an optimization - without
* this shortcut, EA operations may bogusly fail with EACCES.
*/
- if (zp->z_phys->zp_flags & ZFS_XATTR)
+ if (zp->z_pflags & ZFS_XATTR)
return (0);
#else
/*
* If attribute then validate against base file
*/
if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
+ parent, &xzp)) != 0) {
return (error);
}
@@ -2598,11 +2467,36 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
#endif
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. Map the bits mapped by working_mode (currently
+ * missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
&check_privs, skipaclchk, cr)) == 0) {
if (is_attr)
VN_RELE(ZTOV(xzp));
- return (0);
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
}
if (error && !check_privs) {
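
The mapping block above is the heart of the new secpolicy_vnode_access2() integration: the requested ACE-style bits are folded into the classic VREAD|VWRITE|VEXEC vnode bits (with READ_ACL/READ_ATTRIBUTES granted implicitly to the owner), so the least-privilege layer can reason in rwx terms. Restated as a standalone hedged helper using the same macros as the hunk:

	static mode_t
	ace_mask_to_vmode(uint32_t working_mode)
	{
		mode_t needed_bits = 0;

		if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
		    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
			needed_bits |= VREAD;
		if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
		    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
			needed_bits |= VWRITE;
		if (working_mode & ACE_EXECUTE)
			needed_bits |= VEXEC;
		return (needed_bits);
	}
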
@@ -2616,12 +2510,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
if (error && check_privs) {
- uid_t owner;
mode_t checkmode = 0;
- owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
- ZFS_OWNER);
-
/*
* First check for implicit owner permission on
* read_acl/read_attributes
@@ -2643,9 +2533,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
if (working_mode & ACE_EXECUTE)
checkmode |= VEXEC;
- if (checkmode)
- error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- owner, checkmode);
+ error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
if (error == 0 && (working_mode & ACE_WRITE_OWNER))
error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
@@ -2668,8 +2557,12 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
error = EACCES;
}
}
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
}
+
if (is_attr)
VN_RELE(ZTOV(xzp));
@@ -2699,15 +2592,15 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
static int
zfs_delete_final_check(znode_t *zp, znode_t *dzp,
- mode_t missing_perms, cred_t *cr)
+ mode_t available_perms, cred_t *cr)
{
int error;
uid_t downer;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
- error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
if (error == 0)
error = zfs_sticky_remove_access(dzp, zp, cr);
@@ -2756,7 +2649,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
uint32_t dzp_working_mode = 0;
uint32_t zp_working_mode = 0;
int dzp_error, zp_error;
- mode_t missing_perms;
+ mode_t available_perms;
boolean_t dzpcheck_privs = B_TRUE;
boolean_t zpcheck_privs = B_TRUE;
@@ -2774,7 +2667,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
* to determine what was found.
*/
- if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
return (EPERM);
/*
@@ -2817,23 +2710,20 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
* only need to see if we have write/execute on directory.
*/
- if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
- return (zfs_sticky_remove_access(dzp, zp, cr));
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
- if (!dzpcheck_privs)
+ if (dzp_error != 0 && !dzpcheck_privs)
return (dzp_error);
/*
* Fourth row
*/
- missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
- missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
-
- ASSERT(missing_perms);
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
- return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
}
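
Note the inversion in the delete path: the old code computed missing_perms (what the ACL denied) and only consulted secpolicy_vnode_access() when something was missing, while the new code computes available_perms (what the ACL already granted) and always lets secpolicy_vnode_access2() cover the shortfall against the required VWRITE|VEXEC. A hedged restatement, remembering that after zfs_zaccess_common() the working mode holds the bits the ACL did not satisfy:

	static mode_t
	acl_granted_bits(uint32_t dzp_working_mode)
	{
		mode_t available_perms = 0;

		if (!(dzp_working_mode & ACE_WRITE_DATA))
			available_perms |= VWRITE;	/* write already granted */
		if (!(dzp_working_mode & ACE_EXECUTE))
			available_perms |= VEXEC;	/* lookup already granted */
		return (available_perms);
	}

secpolicy_vnode_access2(cr, vp, owner, available_perms, VWRITE|VEXEC) then only has to find privileges for (VWRITE|VEXEC) & ~available_perms.
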
@@ -2844,7 +2734,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
int add_perm;
int error;
- if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
add_perm = (ZTOV(szp)->v_type == VDIR) ?
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
index cd36696f9500..acf632bdbeff 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,6 +27,7 @@
#include <sys/vfs.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
#include <sys/zfs_acl.h>
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index 48c3ebf78a58..7372ee74d3ea 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -110,17 +110,41 @@ snapentry_compare(const void *a, const void *b)
return (0);
}
+#ifdef sun
+vnodeops_t *zfsctl_ops_root;
+vnodeops_t *zfsctl_ops_snapdir;
+vnodeops_t *zfsctl_ops_snapshot;
+vnodeops_t *zfsctl_ops_shares;
+vnodeops_t *zfsctl_ops_shares_dir;
+
+static const fs_operation_def_t zfsctl_tops_root[];
+static const fs_operation_def_t zfsctl_tops_snapdir[];
+static const fs_operation_def_t zfsctl_tops_snapshot[];
+static const fs_operation_def_t zfsctl_tops_shares[];
+#else /* !sun */
static struct vop_vector zfsctl_ops_root;
static struct vop_vector zfsctl_ops_snapdir;
static struct vop_vector zfsctl_ops_snapshot;
static struct vop_vector zfsctl_ops_shares;
static struct vop_vector zfsctl_ops_shares_dir;
+#endif /* !sun */
static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
+#ifdef sun
+static gfs_opsvec_t zfsctl_opsvec[] = {
+ { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+ { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+ { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+ { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
+ { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
+ { NULL }
+};
+#endif /* sun */
+
/*
* Root directory elements. We only have two entries
* snapshot and shares.
@@ -144,11 +168,35 @@ static gfs_dirent_t zfsctl_root_entries[] = {
void
zfsctl_init(void)
{
+#ifdef sun
+ VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+#endif
}
void
zfsctl_fini(void)
{
+#ifdef sun
+ /*
+ * Remove vfsctl vnode ops
+ */
+ if (zfsctl_ops_root)
+ vn_freevnodeops(zfsctl_ops_root);
+ if (zfsctl_ops_snapdir)
+ vn_freevnodeops(zfsctl_ops_snapdir);
+ if (zfsctl_ops_snapshot)
+ vn_freevnodeops(zfsctl_ops_snapshot);
+ if (zfsctl_ops_shares)
+ vn_freevnodeops(zfsctl_ops_shares);
+ if (zfsctl_ops_shares_dir)
+ vn_freevnodeops(zfsctl_ops_shares_dir);
+
+ zfsctl_ops_root = NULL;
+ zfsctl_ops_snapdir = NULL;
+ zfsctl_ops_snapshot = NULL;
+ zfsctl_ops_shares = NULL;
+ zfsctl_ops_shares_dir = NULL;
+#endif /* sun */
}
boolean_t
@@ -191,6 +239,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
{
vnode_t *vp, *rvp;
zfsctl_node_t *zcp;
+ uint64_t crtime[2];
ASSERT(zfsvfs->z_ctldir == NULL);
@@ -201,7 +250,9 @@ zfsctl_create(zfsvfs_t *zfsvfs)
zcp->zc_id = ZFSCTL_INO_ROOT;
VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
VN_URELE(rvp);
/*
@@ -273,12 +324,12 @@ static int
zfsctl_common_access(ap)
struct vop_access_args /* {
struct vnode *a_vp;
- int a_accmode;
+ accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
- int mode = ap->a_accmode;
+ accmode_t accmode = ap->a_accmode;
#ifdef TODO
if (flags & V_ACE_MASK) {
@@ -286,8 +337,8 @@ zfsctl_common_access(ap)
return (EACCES);
} else {
#endif
- if (mode & VWRITE)
- return (EACCES);
+ if (accmode & VWRITE)
+ return (EACCES);
#ifdef TODO
}
#endif
@@ -301,14 +352,13 @@ zfsctl_common_access(ap)
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
- zfsctl_node_t *zcp = vp->v_data;
timestruc_t now;
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_rdev = 0;
/*
- * We are a purly virtual object, so we have no
+ * We are a purely virtual object, so we have no
* blocksize or allocated blocks.
*/
vap->va_blksize = 0;
@@ -323,7 +373,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
*/
gethrestime(&now);
vap->va_atime = now;
- vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
/* FreeBSD: Reset chflags(2) flags. */
vap->va_flags = 0;
}
@@ -363,6 +412,7 @@ zfsctl_common_fid(ap)
return (0);
}
+
/*ARGSUSED*/
static int
zfsctl_shares_fid(ap)
@@ -436,16 +486,18 @@ zfsctl_root_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_node_t *zcp = vp->v_data;
ZFS_ENTER(zfsvfs);
vap->va_nodeid = ZFSCTL_INO_ROOT;
vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+ vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+ vap->va_birthtime = vap->va_ctime;
zfsctl_common_getattr(vp, vap);
ZFS_EXIT(zfsvfs);
@@ -453,6 +505,40 @@ zfsctl_root_getattr(ap)
return (0);
}
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ /*
+ * No extended attributes allowed under .zfs
+ */
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(nm, "..") == 0) {
+ err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
+ if (err == 0)
+ VOP_UNLOCK(*vpp, 0);
+ } else {
+ err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (err);
+}
+
#ifdef sun
static int
zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
@@ -493,40 +579,6 @@ static const fs_operation_def_t zfsctl_tops_root[] = {
*/
/* ARGSUSED */
int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
- if (err == 0)
- VOP_UNLOCK(*vpp, 0);
- } else {
- err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
- cr, ct, direntflags, realpnp);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
zfsctl_freebsd_root_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
@@ -551,7 +603,6 @@ zfsctl_freebsd_root_lookup(ap)
err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL);
if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-
return (err);
}
@@ -566,6 +617,9 @@ static struct vop_vector zfsctl_ops_root = {
.vop_lookup = zfsctl_freebsd_root_lookup,
.vop_inactive = gfs_vop_inactive,
.vop_reclaim = zfsctl_common_reclaim,
+#ifdef TODO
+ .vop_pathconf = zfsctl_pathconf,
+#endif
.vop_fid = zfsctl_common_fid,
};
@@ -596,10 +650,32 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
if ((error = vn_vfswlock(svp)) != 0)
return (error);
+#ifdef sun
+ VN_HOLD(svp);
+ error = dounmount(vn_mountedvfs(svp), fflags, cr);
+ if (error) {
+ VN_RELE(svp);
+ return (error);
+ }
+
+ /*
+ * We can't use VN_RELE(), as that will try to invoke
+ * zfsctl_snapdir_inactive(), which would cause us to destroy
+ * the sd_lock mutex held by our caller.
+ */
+ ASSERT(svp->v_count == 1);
+ gfs_vop_inactive(svp, cr, NULL);
+
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ return (0);
+#else /* !sun */
return (dounmount(vn_mountedvfs(svp), fflags, curthread));
+#endif /* !sun */
}
-#if 0
+#ifdef sun
static void
zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
{
@@ -639,7 +715,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath);
+ vfs_setmntpoint(vfsp, newpath, 0);
pathref = vfs_getresource(vfsp);
(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
@@ -648,13 +724,13 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setresource(vfsp, newpath);
+ vfs_setresource(vfsp, newpath, 0);
vfs_unlock(vfsp);
}
-#endif
+#endif /* sun */
-#if 0
+#ifdef sun
/*ARGSUSED*/
static int
zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
@@ -717,9 +793,9 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
return (err);
}
-#endif
+#endif /* sun */
-#if 0
+#ifdef sun
/* ARGSUSED */
static int
zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
@@ -769,7 +845,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
avl_insert(&sdp->sd_snaps, sep, where);
} else
- err = dmu_objset_destroy(snapname);
+ err = dmu_objset_destroy(snapname, B_FALSE);
} else {
err = ENOENT;
}
@@ -778,7 +854,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
return (err);
}
-#endif
+#endif /* sun */
/*
* This creates a snapshot under '.zfs/snapshot'.
@@ -806,7 +882,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
return (err);
if (err == 0) {
- err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
+ err = dmu_objset_snapshot(name, dirname, NULL, NULL,
+ B_FALSE, B_FALSE, -1);
if (err)
return (err);
err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
@@ -951,8 +1028,7 @@ zfsctl_snapdir_lookup(ap)
*/
return (err == EILSEQ ? ENOENT : err);
}
- if (dmu_objset_open(snapname, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
+ if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
mutex_exit(&sdp->sd_lock);
/* Translate errors and add SAVENAME when needed. */
if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
@@ -972,7 +1048,7 @@ zfsctl_snapdir_lookup(ap)
VN_HOLD(*vpp);
avl_insert(&sdp->sd_snaps, sep, where);
- dmu_objset_close(snap);
+ dmu_objset_rele(snap, FTAG);
domount:
mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
@@ -1194,6 +1270,8 @@ zfsctl_shares_getattr(ap)
}
ZFS_EXIT(zfsvfs);
return (error);
+
+
}
/* ARGSUSED */
@@ -1203,11 +1281,10 @@ zfsctl_snapdir_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
zfsctl_snapdir_t *sdp = vp->v_data;
@@ -1215,6 +1292,8 @@ zfsctl_snapdir_getattr(ap)
zfsctl_common_getattr(vp, vap);
vap->va_nodeid = gfs_file_inode(vp);
vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+ vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_birthtime = vap->va_ctime;
ZFS_EXIT(zfsvfs);
return (0);
@@ -1251,6 +1330,38 @@ zfsctl_snapdir_inactive(ap)
return (0);
}
+#ifdef sun
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+ { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+ { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+ { VOPNAME_IOCTL, { .error = fs_inval } },
+ { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+ { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } },
+ { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } },
+ { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } },
+ { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } },
+ { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+ { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } },
+ { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
+ { NULL }
+};
+
+static const fs_operation_def_t zfsctl_tops_shares[] = {
+ { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+ { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+ { VOPNAME_IOCTL, { .error = fs_inval } },
+ { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+ { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } },
+ { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+ { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
+ { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } },
+ { NULL }
+};
+#else /* !sun */
static struct vop_vector zfsctl_ops_snapdir = {
.vop_default = &default_vnodeops,
.vop_open = zfsctl_common_open,
@@ -1279,6 +1390,7 @@ static struct vop_vector zfsctl_ops_shares = {
.vop_reclaim = zfsctl_common_reclaim,
.vop_fid = zfsctl_shares_fid,
};
+#endif /* !sun */
/*
* pvp is the GFS vnode '.zfs/snapshot'.
@@ -1347,8 +1459,8 @@ zfsctl_snapshot_inactive(ap)
if (!locked)
mutex_exit(&sdp->sd_lock);
VN_RELE(dvp);
-end:
+end:
/*
* Dispose of the vnode for the snapshot mount point.
* This is safe to do because once this entry has been removed
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
new file mode 100644
index 000000000000..d0f411a99350
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ zfs_dbgmsg_t *zdm;
+
+ while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+ int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_destroy(&zfs_dbgmsgs_lock);
+ ASSERT3U(zfs_dbgmsg_size, ==, 0);
+}
+
+/*
+ * Print these messages by running:
+ * echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ zfs_dbgmsg_t *zdm;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx);
+ va_end(adx);
+
+ /*
+ * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+ * for the terminating null.
+ */
+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+ zdm->zdm_timestamp = gethrestime_sec();
+
+ va_start(adx, fmt);
+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
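
The new zfs_dbgmsg() sizes its allocation with a measuring vsnprintf() pass (NULL buffer, zero length), reuses the one byte of zdm_msg inside zfs_dbgmsg_t for the terminating NUL, and trims old entries from the list head once zfs_dbgmsg_size exceeds zfs_dbgmsg_maxsize. A simplified userland model of the two-pass sizing idiom:

	#include <stdarg.h>
	#include <stdio.h>
	#include <stdlib.h>

	char *
	fmt_alloc(const char *fmt, ...)
	{
		va_list ap;
		int size;
		char *msg;

		va_start(ap, fmt);
		size = vsnprintf(NULL, 0, fmt, ap);	/* measure only */
		va_end(ap);
		if (size < 0 || (msg = malloc(size + 1)) == NULL)
			return (NULL);
		va_start(ap, fmt);
		(void) vsnprintf(msg, size + 1, fmt, ap);	/* format for real */
		va_end(ap);
		return (msg);
	}
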
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index 3ac4741cffc9..bae9071c3c39 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -52,6 +51,8 @@
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/extdirent.h>
@@ -286,8 +287,10 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
* See if there's an object by this name; if so, put a hold on it.
*/
if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
} else {
if (update)
vp = dnlc_lookup(ZTOV(dzp), name);
@@ -379,25 +382,29 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
zfs_dirlock_t *dl;
znode_t *zp;
int error = 0;
+ uint64_t parent;
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
*vpp = ZTOV(dzp);
VN_HOLD(*vpp);
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+
/*
* If we are a snapshot mounted under .zfs, return
* the vp for the snapshot directory.
*/
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
"snapshot", vpp, NULL, 0, NULL, kcred,
NULL, NULL, NULL);
return (error);
}
rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ error = zfs_zget(zfsvfs, parent, &zp);
if (error == 0)
*vpp = ZTOV(zp);
rw_exit(&dzp->z_parent_lock);
@@ -445,7 +452,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
+ ASSERT(zp->z_links == 0);
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -540,10 +547,12 @@ zfs_purgedir(znode_t *dzp)
(ZTOV(xzp)->v_type == VLNK));
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed? */
+ zfs_sa_upgrade_txholds(tx, xzp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -576,15 +585,16 @@ zfs_rmnode(znode_t *zp)
znode_t *xzp = NULL;
dmu_tx_t *tx;
uint64_t acl_obj;
+ uint64_t xattr_obj;
int error;
- ASSERT(zp->z_phys->zp_links == 0);
+ ASSERT(zp->z_links == 0);
/*
* If this is an attribute directory, purge its contents.
*/
if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_XATTR)) {
+ (zp->z_pflags & ZFS_XATTR)) {
if (zfs_purgedir(zp) != 0) {
/*
* Not enough space to delete some xattrs.
@@ -613,12 +623,14 @@ zfs_rmnode(znode_t *zp)
* If the file has extended attributes, we're going to unlink
* the xattr dir.
*/
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT(error == 0);
}
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ acl_obj = zfs_external_acl(zp);
/*
* Set up the final transaction.
@@ -627,11 +639,13 @@ zfs_rmnode(znode_t *zp)
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
/*
@@ -646,10 +660,12 @@ zfs_rmnode(znode_t *zp)
}
if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ ASSERT(error == 0);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
+ xzp->z_links = 0; /* no more links to it */
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx));
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -667,11 +683,12 @@ out:
}
static uint64_t
-zfs_dirent(znode_t *zp)
+zfs_dirent(znode_t *zp, uint64_t mode)
{
uint64_t de = zp->z_id;
+
if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT((zp)->z_phys->zp_mode) << 60;
+ de |= IFTODT(mode) << 60;
return (de);
}
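
zfs_dirent() now takes the mode explicitly (the znode_phys copy is gone), but the encoding is unchanged: the 64-bit ZAP value for a directory entry carries the object number in the low bits and, on pools at ZPL_VERSION_DIRENT_TYPE or later, the IFTODT() file type in the top nibble. A hedged standalone decode (the in-tree headers use bit-field macros for this; here only the top four bits are assumed to carry the type):

	static uint64_t
	dirent_obj(uint64_t de)
	{
		return (de & ~(0xfULL << 60));	/* strip the type nibble */
	}

	static int
	dirent_type(uint64_t de)
	{
		return ((int)(de >> 60) & 0xf);	/* DT_REG, DT_DIR, ... */
	}
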
@@ -682,12 +699,15 @@ int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
if (!(flag & ZRENAMING)) {
@@ -696,22 +716,47 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
mutex_exit(&zp->z_lock);
return (ENOENT);
}
- zp->z_phys->zp_links++;
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime, B_TRUE);
}
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
mutex_exit(&zp->z_lock);
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
- value = zfs_dirent(zp);
+ value = zfs_dirent(zp, zp->z_mode);
error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
8, 1, &value, tx);
ASSERT(error == 0);
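
This hunk shows the commit's signature idiom: instead of dirtying the bonus buffer and poking znode_phys fields, changed attributes are staged with SA_ADD_BULK_ATTR() and written in one sa_bulk_update() call under the assigned tx. A condensed hedged model of the pattern (not a function in the patch):

	static int
	bump_links_and_ctime(znode_t *zp, dmu_tx_t *tx)
	{
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
		sa_bulk_attr_t bulk[2];
		uint64_t mtime[2], ctime[2];
		int count = 0;

		zp->z_links++;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
		    &zp->z_links, sizeof (zp->z_links));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE);
		return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
	}
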
@@ -721,6 +766,30 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
return (0);
}
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
+ (flag & ZCIEXACT)) ||
+ ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(flag & ZCILOOK)))
+ error = zap_remove_norm(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, MT_EXACT, tx);
+ else
+ error = zap_remove_norm(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, MT_FIRST, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, tx);
+ }
+
+ return (error);
+}
+
/*
* Unlink zp from dl, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
@@ -733,16 +802,18 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
boolean_t *unlinkedp)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
dnlc_remove(ZTOV(dzp), dl->dl_name);
if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
if (vn_vfswlock(vp)) /* prevent new mounts on zp */
return (EBUSY);
@@ -752,51 +823,74 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}
mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
return (ENOTEMPTY);
}
- if (zp->z_phys->zp_links <= zp_is_dir) {
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0) {
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
zfs_panic_recover("zfs: link count on vnode %p is %u, "
"should be at least %u", zp->z_vnode,
- (int)zp->z_phys->zp_links,
+ (int)zp->z_links,
zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
+ zp->z_links = zp_is_dir + 1;
}
- if (--zp->z_phys->zp_links == zp_is_dir) {
+ if (--zp->z_links == zp_is_dir) {
zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
+ zp->z_links = 0;
unlinked = B_TRUE;
} else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
}
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
+ } else {
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0)
+ return (error);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
- else
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
- } else {
- error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
- }
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -814,7 +908,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
}
int
@@ -826,6 +920,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
int error;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t parent;
*xvpp = NULL;
@@ -846,28 +941,39 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
return (EDQUOT);
}
+top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
- if (error == ERESTART)
+ if (error == ERESTART) {
dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
return (error);
}
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- ASSERT(xzp->z_phys->zp_parent == zp->z_id);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_xattr = xzp->z_id;
+#ifdef DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
xzp, "", NULL, acl_ids.z_fuidp, vap);
@@ -912,7 +1018,6 @@ top:
return (0);
}
- ASSERT(zp->z_phys->zp_xattr == 0);
if (!(flags & CREATE_XATTR_DIR)) {
zfs_dirent_unlock(dl);
@@ -980,11 +1085,11 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
if (zdp->z_zfsvfs->z_replay)
return (0);
- if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+ if ((zdp->z_mode & S_ISVTX) == 0)
return (0);
- downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
- fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
if ((uid = crgetuid(cr)) == downer || uid == fowner ||
(ZTOV(zp)->v_type == VREG &&
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
index 4b27ec324c9c..0b4812666442 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -28,16 +28,12 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
-
-#ifdef _KERNEL
-/* Including sys/bus.h is just too hard, so I declare what I need here. */
-extern void devctl_notify(const char *__system, const char *__subsystem,
- const char *__type, const char *__data);
-#endif
+#include <sys/sysevent.h>
/*
* This general routine is responsible for generating all the different ZFS
@@ -92,21 +88,32 @@ extern void devctl_notify(const char *__system, const char *__subsystem,
* this pointer is set to NULL, and no ereport will be generated (since it
* doesn't actually correspond to any particular device or piece of data,
* and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs. Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO. When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
*/
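
A self-contained toy model of the deferral this comment describes, with hypothetical names: the report is queued on the logical I/O when the error is noticed, and only annotated and finished once the good and bad buffers are known at completion.

	struct cksum_report {
		struct cksum_report *cr_next;
		void (*cr_finish)(struct cksum_report *, const void *good,
		    const void *bad);
	};

	struct logical_io {
		struct cksum_report *io_cksum_report;	/* deferred reports */
	};

	static void
	io_defer_report(struct logical_io *io, struct cksum_report *cr)
	{
		cr->cr_next = io->io_cksum_report;	/* just queue it for now */
		io->io_cksum_report = cr;
	}

	static void
	io_done(struct logical_io *io, const void *good, const void *bad)
	{
		struct cksum_report *cr;

		for (cr = io->io_cksum_report; cr != NULL; cr = cr->cr_next)
			cr->cr_finish(cr, good, bad);	/* annotate and post */
		io->io_cksum_report = NULL;
	}
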
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+#ifdef _KERNEL
+static void
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+ const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
uint64_t stateoroffset, uint64_t size)
{
-#ifdef _KERNEL
- char buf[1024];
- struct sbuf sb;
- struct timespec ts;
- int error;
+ nvlist_t *ereport, *detector;
+
+ uint64_t ena;
+ char class[64];
/*
- * If we are doing a spa_tryimport(), ignore errors.
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
*/
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
return;
/*
@@ -114,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* failed, don't bother logging any new ereports - we're just going to
* get the same diagnosis anyway.
*/
- if (spa->spa_load_state != SPA_LOAD_NONE &&
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
spa->spa_last_open_failed)
return;
@@ -153,9 +160,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* not yet been asynchronously placed into the REMOVED
* state.
*/
- if (zio->io_vd == vd &&
- !vdev_accessible(vd, zio) &&
- strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
return;
/*
@@ -169,51 +174,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
return;
}
}
- nanotime(&ts);
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return;
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
/*
* Serialize ereport generation
*/
mutex_enter(&spa->spa_errlist_lock);
-#if 0
/*
* Determine the ENA to use for this event. If we are in a loading
* state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
* a root zio-wide ENA. Otherwise, simply use a unique ENA.
*/
- if (spa->spa_load_state != SPA_LOAD_NONE) {
-#if 0
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
if (spa->spa_ena == 0)
spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
-#endif
ena = spa->spa_ena;
} else if (zio != NULL && zio->io_logical != NULL) {
-#if 0
if (zio->io_logical->io_ena == 0)
zio->io_logical->io_ena =
fm_ena_generate(0, FM_ENA_FMT1);
-#endif
ena = zio->io_logical->io_ena;
} else {
-#if 0
ena = fm_ena_generate(0, FM_ENA_FMT1);
-#else
- ena = 0;
-#endif
}
-#endif
/*
* Construct the full class, detector, and other standard FMA fields.
*/
- sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
- sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
- sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
/*
* Construct the per-ereport payload, depending on which parameters are
@@ -223,51 +234,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
/*
* Generic payload members common to all ereports.
*/
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa));
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
- spa->spa_load_state);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa_load_state(spa), NULL);
if (spa != NULL) {
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ DATA_TYPE_STRING,
spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
FM_EREPORT_FAILMODE_WAIT :
spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
- FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC);
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+ NULL);
}
if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent;
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- vd->vdev_ops->vdev_op_type);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
if (vd->vdev_path != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
if (vd->vdev_devid != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
if (vd->vdev_fru != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
if (pvd != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
- pvd->vdev_ops->vdev_op_type);
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
if (pvd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
- pvd->vdev_path);
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
if (pvd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
- pvd->vdev_devid);
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
}
}
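
Where the FreeBSD sbuf_printf() path once built a flat event string, the code now targets the illumos FMA API: fm_payload_set() takes repeated (name, DATA_TYPE_*, value) groups terminated by NULL. The call shape, as used in the hunk above:

	static void
	payload_example(nvlist_t *ereport, zio_t *zio)
	{
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32,
		    zio->io_error,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64,
		    zio->io_size,
		    NULL);	/* the terminator is mandatory */
	}
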
@@ -275,8 +292,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
/*
* Payload common to all I/Os.
*/
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
- zio->io_error);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
@@ -284,52 +301,500 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* provided for us, instead of within the zio_t.
*/
if (vd != NULL) {
- if (size) {
- sbuf_printf(&sb, " %s=%ju",
+ if (size)
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- stateoroffset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
- } else {
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- zio->io_offset);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64, zio->io_offset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- zio->io_size);
- }
+ DATA_TYPE_UINT64, zio->io_size, NULL);
}
/*
* Payload for I/Os with corresponding logical information.
*/
- if (zio->io_logical != NULL) {
- sbuf_printf(&sb, " %s=%ju",
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- zio->io_logical->io_bookmark.zb_object);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- zio->io_logical->io_bookmark.zb_level);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_INT64,
+ zio->io_logical->io_bookmark.zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- zio->io_logical->io_bookmark.zb_blkid);
- }
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid, NULL);
} else if (vd != NULL) {
/*
* If we have a vdev but no zio, this is a device fault, and the
* 'stateoroffset' parameter indicates the previous state of the
* vdev.
*/
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
- stateoroffset);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
}
+
mutex_exit(&spa->spa_errlist_lock);
- error = sbuf_finish(&sb);
- devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
- if (error != 0)
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
+ *ereport_out = ereport;
+ *detector_out = detector;
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
+
+#define MAX_RANGES 16
+
+typedef struct zfs_ecksum_info {
+ /* histograms of set and cleared bits by bit number in a 64-bit word */
+ uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+ /* inline arrays of bits set and cleared. */
+ uint64_t zei_bits_set[ZFM_MAX_INLINE];
+ uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+ /*
+ * for each range, the number of bits set and cleared. The Hamming
+ * distance between the good and bad buffers is the sum of them all.
+ */
+ uint32_t zei_range_sets[MAX_RANGES];
+ uint32_t zei_range_clears[MAX_RANGES];
+
+ struct zei_ranges {
+ uint32_t zr_start;
+ uint32_t zr_end;
+ } zei_ranges[MAX_RANGES];
+
+ size_t zei_range_count;
+ uint32_t zei_mingap;
+ uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
+{
+ size_t i;
+ size_t bits = 0;
+ uint64_t value = BE_64(value_arg);
+
+ /* We store the bits in big-endian (largest-first) order */
+ for (i = 0; i < 64; i++) {
+ if (value & (1ull << i)) {
+ hist[63 - i]++;
+ ++bits;
+ }
+ }
+ /* update the count of bits changed */
+ *count += bits;
+}
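
update_histogram() counts, per bit position, how often that bit differed, numbering positions from the most significant end of the big-endian view of each word so the histogram is host-order independent; *count accumulates the Hamming weight. A hedged equivalent over a value already in that normalized order (the real function applies BE_64() itself):

	static void
	count_bits(uint64_t value, uint16_t hist[64], uint32_t *count)
	{
		for (int i = 0; i < 64; i++) {
			if (value & (1ULL << i)) {
				hist[63 - i]++;	/* bit 63 (MSB) lands in hist[0] */
				(*count)++;
			}
		}
	}
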
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly. zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that. We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, there will be at least one. We also update zei_mingap
+ * to the new smallest gap, to prepare for our next invocation.
+ */
+static void
+shrink_ranges(zfs_ecksum_info_t *eip)
+{
+ uint32_t mingap = UINT32_MAX;
+ uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+ size_t idx, output;
+ size_t max = eip->zei_range_count;
+
+ struct zei_ranges *r = eip->zei_ranges;
+
+ ASSERT3U(eip->zei_range_count, >, 0);
+ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+ output = idx = 0;
+ while (idx < max - 1) {
+ uint32_t start = r[idx].zr_start;
+ uint32_t end = r[idx].zr_end;
+
+ while (idx < max - 1) {
+ idx++;
+
+ uint32_t nstart = r[idx].zr_start;
+ uint32_t nend = r[idx].zr_end;
+
+ uint32_t gap = nstart - end;
+ if (gap < new_allowed_gap) {
+ end = nend;
+ continue;
+ }
+ if (gap < mingap)
+ mingap = gap;
+ break;
+ }
+ r[output].zr_start = start;
+ r[output].zr_end = end;
+ output++;
+ }
+ ASSERT3U(output, <, eip->zei_range_count);
+ eip->zei_range_count = output;
+ eip->zei_mingap = mingap;
+ eip->zei_allowed_mingap = new_allowed_gap;
+}
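
The coalescing rule above: once the 16-entry range array fills, the allowed gap is raised to one more than the smallest gap seen, and any neighbors closer than that are joined. A simplified standalone model of the join step (the in-tree version additionally tracks the next-smallest gap for the following pass); with ranges {0,2} {4,6} {20,22} and allowed_gap 3 it yields {0,6} {20,22}:

	static size_t
	merge_ranges(uint32_t (*r)[2], size_t n, uint32_t allowed_gap)
	{
		size_t in, out = 0;

		for (in = 0; in < n; in++) {
			if (out > 0 && r[in][0] - r[out - 1][1] < allowed_gap) {
				r[out - 1][1] = r[in][1];	/* join neighbors */
			} else {
				r[out][0] = r[in][0];		/* keep as-is */
				r[out][1] = r[in][1];
				out++;
			}
		}
		return (out);
	}
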
+
+static void
+add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+
+ if (count >= MAX_RANGES) {
+ shrink_ranges(eip);
+ count = eip->zei_range_count;
+ }
+ if (count == 0) {
+ eip->zei_mingap = UINT32_MAX;
+ eip->zei_allowed_mingap = 1;
+ } else {
+ int gap = start - r[count - 1].zr_end;
+
+ if (gap < eip->zei_allowed_mingap) {
+ r[count - 1].zr_end = end;
+ return;
+ }
+ if (gap < eip->zei_mingap)
+ eip->zei_mingap = gap;
+ }
+ r[count].zr_start = start;
+ r[count].zr_end = end;
+ eip->zei_range_count++;
+}
+
+static size_t
+range_total_size(zfs_ecksum_info_t *eip)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+ size_t result = 0;
+ size_t idx;
+
+ for (idx = 0; idx < count; idx++)
+ result += (r[idx].zr_end - r[idx].zr_start);
+
+ return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+ const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
+ boolean_t drop_if_identical)
+{
+ const uint64_t *good = (const uint64_t *)goodbuf;
+ const uint64_t *bad = (const uint64_t *)badbuf;
+
+ uint64_t allset = 0;
+ uint64_t allcleared = 0;
+
+ size_t nui64s = size / sizeof (uint64_t);
+
+ size_t inline_size;
+ int no_inline = 0;
+ size_t idx;
+ size_t range;
+
+ size_t offset = 0;
+ ssize_t start = -1;
+
+ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+ /* don't do any annotation for injected checksum errors */
+ if (info != NULL && info->zbc_injected)
+ return (eip);
+
+ if (info != NULL && info->zbc_has_cksum) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_expected) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_expected,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_actual) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_actual,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+ DATA_TYPE_STRING,
+ info->zbc_checksum_name,
+ NULL);
+
+ if (info->zbc_byteswapped) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+ DATA_TYPE_BOOLEAN, 1,
+ NULL);
+ }
+ }
+
+ if (badbuf == NULL || goodbuf == NULL)
+ return (eip);
+
+ ASSERT3U(nui64s, <=, UINT16_MAX);
+ ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, UINT32_MAX);
+
+ /* build up the range list by comparing the two buffers. */
+ for (idx = 0; idx < nui64s; idx++) {
+ if (good[idx] == bad[idx]) {
+ if (start == -1)
+ continue;
+
+ add_range(eip, start, idx);
+ start = -1;
+ } else {
+ if (start != -1)
+ continue;
+
+ start = idx;
+ }
+ }
+ if (start != -1)
+ add_range(eip, start, idx);
+
+ /* See if it will fit in our inline buffers */
+ inline_size = range_total_size(eip);
+ if (inline_size > ZFM_MAX_INLINE)
+ no_inline = 1;
+
+ /*
+ * If the buffers are identical (no change) and the caller asked us to
+ * drop such reports, do so.
+ */
+ if (inline_size == 0 && drop_if_identical) {
+ kmem_free(eip, sizeof (*eip));
+ return (NULL);
+ }
+
+ /*
+ * Now walk through the ranges, filling in the details of the
+ * differences. Also convert our uint64_t-array offsets to byte
+ * offsets.
+ */
+ for (range = 0; range < eip->zei_range_count; range++) {
+ size_t start = eip->zei_ranges[range].zr_start;
+ size_t end = eip->zei_ranges[range].zr_end;
+
+ for (idx = start; idx < end; idx++) {
+ uint64_t set, cleared;
+
+ /* bits set in bad, but not in good */
+ set = ((~good[idx]) & bad[idx]);
+ /* bits set in good, but not in bad */
+ cleared = (good[idx] & (~bad[idx]));
+
+ allset |= set;
+ allcleared |= cleared;
+
+ if (!no_inline) {
+ ASSERT3U(offset, <, inline_size);
+ eip->zei_bits_set[offset] = set;
+ eip->zei_bits_cleared[offset] = cleared;
+ offset++;
+ }
+
+ update_histogram(set, eip->zei_histogram_set,
+ &eip->zei_range_sets[range]);
+ update_histogram(cleared, eip->zei_histogram_cleared,
+ &eip->zei_range_clears[range]);
+ }
+
+ /* convert to byte offsets */
+ eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+ eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
+ }
+ eip->zei_allowed_mingap *= sizeof (uint64_t);
+ inline_size *= sizeof (uint64_t);
+
+ /* fill in ereport */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+ (uint32_t *)eip->zei_ranges,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+ DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+ NULL);
+
+ if (!no_inline) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_cleared,
+ NULL);
+ } else {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+ DATA_TYPE_UINT16_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+ DATA_TYPE_UINT16_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+ NULL);
+ }
+ return (eip);
+}
+#endif
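+/*
+ * Hedged sketch (editorial, with made-up values): for one damaged word,
+ * annotate_ecksum() derives the two per-word deltas like this:
+ *
+ *	good = 0xF0, bad = 0x3C
+ *	set     = ~good & bad  = 0x0C	(bits that appeared)
+ *	cleared =  good & ~bad = 0xC0	(bits that vanished)
+ *
+ * The Hamming distance contributed by this word is popcount(set) +
+ * popcount(cleared) = 2 + 2 = 4.
+ */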
+
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+
+ zfs_ereport_start(&ereport, &detector,
+ subclass, spa, vd, zio, stateoroffset, size);
+
+ if (ereport == NULL)
+ return;
+
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif
+}
+
+void
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+ zio_bad_cksum_t *info)
+{
+ zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+ if (zio->io_vsd != NULL)
+ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+ else
+ zio_vsd_default_cksum_report(zio, report, arg);
+
+ /* copy the checksum failure information if it was provided */
+ if (info != NULL) {
+ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+ bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ }
+
+ report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_length = length;
+
+#ifdef _KERNEL
+ zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (report->zcr_ereport == NULL) {
+ report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
+ kmem_free(report, sizeof (*report));
+ return;
+ }
+#endif
+
+ mutex_enter(&spa->spa_errlist_lock);
+ report->zcr_next = zio->io_logical->io_cksum_report;
+ zio->io_logical->io_cksum_report = report;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+ zfs_ecksum_info_t *info = NULL;
+ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+ good_data, bad_data, report->zcr_length, drop_if_identical);
+
+ if (info != NULL)
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
+ report->zcr_ereport = report->zcr_detector = NULL;
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+ if (rpt->zcr_ereport != NULL) {
+ fm_nvlist_destroy(rpt->zcr_ereport,
+ FM_NVA_FREE);
+ fm_nvlist_destroy(rpt->zcr_detector,
+ FM_NVA_FREE);
+ }
+#endif
+ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+ if (rpt->zcr_ckinfo != NULL)
+ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+ kmem_free(rpt, sizeof (*rpt));
+}
+
+void
+zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
+{
+#ifdef _KERNEL
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+#endif
+}
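+/*
+ * Editorial sketch of the checksum-report lifecycle implied above (not
+ * part of the original change):
+ *
+ *	zfs_ereport_start_checksum(spa, vd, zio, off, len, arg, &zbc);
+ *	// ... later, once good and bad copies of the data exist:
+ *	zfs_ereport_finish_checksum(report, good, bad, B_TRUE);
+ *	// ... and finally, from the zio teardown path:
+ *	zfs_ereport_free_checksum(report);
+ *
+ * finish posts the annotated ereport (unless the buffers proved
+ * identical and drop_if_identical was set) and destroys the nvlists;
+ * free then releases the callback data and the report itself.
+ */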
+
+void
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+ zfs_ecksum_info_t *info;
+
+ zfs_ereport_start(&ereport, &detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (ereport == NULL)
+ return;
+
+ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+ B_FALSE);
+
+ if (info != NULL)
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
#endif
}
@@ -337,32 +802,28 @@ static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
- char buf[1024];
+ nvlist_t *resource;
char class[64];
- struct sbuf sb;
- struct timespec ts;
- int error;
- nanotime(&ts);
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return;
- snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
ZFS_ERROR_CLASS, name);
- sbuf_printf(&sb, " %s=%d", FM_VERSION, FM_RSRC_VERSION);
- sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
+ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
if (vd)
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- error = sbuf_finish(&sb);
- ZFS_LOG(1, "%s", sbuf_data(&sb));
- devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
- if (error != 0)
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+
+ fm_ereport_post(resource, EVCH_SLEEP);
+
+ fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}
@@ -388,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
index 8090ec10f18a..5b54448aeedb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
-#include <sys/sunddi.h>
#include <sys/dmu.h>
#include <sys/avl.h>
#include <sys/zap.h>
@@ -377,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
- if (zfsvfs->z_fuid_obj)
+ if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
else
domain = nulldomain;
@@ -390,10 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
void
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
{
- *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
- cr, ZFS_OWNER);
- *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
- cr, ZFS_GROUP);
+ *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
}
uid_t
@@ -418,9 +414,9 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
(void) kidmap_getgidbysid(crgetzone(cr), domain,
FUID_RID(fuid), &id);
}
-#else /* sun */
+#else /* !sun */
id = UID_NOBODY;
-#endif /* sun */
+#endif /* !sun */
return (id);
}
@@ -431,7 +427,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
* If ACL has multiple domains, then keep only one copy of each unique
* domain.
*/
-static void
+void
zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
uint64_t idx, uint64_t id, zfs_fuid_type_t type)
{
@@ -492,6 +488,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
/*
* Create a file system FUID, based on information in the users cred
+ *
+ * If the cred contains KSID_OWNER, then it should be used to determine
+ * the uid; otherwise the cred's uid will be used. By default the cred's
+ * gid is used, unless it is an ephemeral ID, in which case KSID_GROUP
+ * will be used if it exists.
*/
uint64_t
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
@@ -506,24 +507,31 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
- if (type == ZFS_OWNER)
- id = crgetuid(cr);
- else
- id = crgetgid(cr);
+ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+
+ if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
- if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id))
return ((uint64_t)id);
+ }
-#ifdef sun
- ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+ /*
+ * ksid is present and FUID is supported
+ */
+ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+ if (!IS_EPHEMERAL(id))
+ return ((uint64_t)id);
+
+ if (type == ZFS_GROUP)
+ id = ksid_getid(ksid);
- VERIFY(ksid != NULL);
rid = ksid_getrid(ksid);
domain = ksid_getdomain(ksid);
-#else /* sun */
- rid = UID_NOBODY;
- domain = nulldomain;
-#endif /* sun */
+
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
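/*
 * Editorial decision table for the logic above (not part of the
 * original change):
 *
 *	no FUID support or no ksid	-> cred uid/gid (NOBODY if ephemeral)
 *	ksid, id not ephemeral		-> plain uid/gid
 *	ksid, id ephemeral		-> FUID built from ksid rid + domain
 */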
@@ -597,7 +605,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
};
domain = fuidp->z_domain_table[idx -1];
} else {
-#ifdef sun
if (type == ZFS_OWNER || type == ZFS_ACE_USER)
status = kidmap_getsidbyuid(crgetzone(cr), id,
&domain, &rid);
@@ -606,7 +613,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
&domain, &rid);
if (status != 0) {
-#endif /* sun */
/*
* When returning nobody we will need to
* make a dummy fuid table entry for logging
@@ -614,9 +620,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
*/
rid = UID_NOBODY;
domain = nulldomain;
-#ifdef sun
}
-#endif /* sun */
}
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
@@ -699,18 +703,16 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
#ifdef sun
ksid_t *ksid = crgetsid(cr, KSID_GROUP);
ksidlist_t *ksidlist = crgetsidlist(cr);
-#endif /* sun */
+#endif /* !sun */
uid_t gid;
#ifdef sun
if (ksid && ksidlist) {
int i;
ksid_t *ksid_groups;
- ksidlist_t *ksidlist = crgetsidlist(cr);
uint32_t idx = FUID_INDEX(id);
uint32_t rid = FUID_RID(id);
- ASSERT(ksidlist);
ksid_groups = ksidlist->ksl_sids;
for (i = 0; i != ksidlist->ksl_nsid; i++) {
@@ -736,7 +738,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
}
}
}
-#endif /* sun */
+#endif /* !sun */
/*
* Not found in ksidlist, check posix groups
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 9a68adffbd6b..52300ee442e1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -47,7 +46,6 @@
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
#include <sys/dmu.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
@@ -65,14 +63,18 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
#include <sys/dmu_objset.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+#include "zfs_ioctl_compat.h"
-CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
+CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
static struct cdev *zfsdev;
@@ -105,17 +107,22 @@ static const char *userquota_perms[] = {
};
static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
-static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+ cred_t *cr);
+static int zfs_check_clearable(char *dataset, nvlist_t *props,
+ nvlist_t **errors);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *);
-int zfs_set_prop_nvlist(const char *, nvlist_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **);
+
+static void zfsdev_close(void *data);
/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
void
__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
{
const char *newfile;
- char buf[256];
+ char buf[512];
va_list adx;
/*
@@ -178,22 +185,15 @@ history_str_get(zfs_cmd_t *zc)
static boolean_t
zfs_is_bootfs(const char *name)
{
- spa_t *spa;
- boolean_t ret = B_FALSE;
-
- if (spa_open(name, &spa, FTAG) == 0) {
- if (spa->spa_bootfs) {
- objset_t *os;
+ objset_t *os;
- if (dmu_objset_open(name, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- ret = (dmu_objset_id(os) == spa->spa_bootfs);
- dmu_objset_close(os);
- }
- }
- spa_close(spa, FTAG);
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ boolean_t ret;
+ ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
+ dmu_objset_rele(os, FTAG);
+ return (ret);
}
- return (ret);
+ return (B_FALSE);
}
/*
@@ -227,13 +227,17 @@ zpl_earlier_version(const char *name, int version)
objset_t *os;
boolean_t rc = B_TRUE;
- if (dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
uint64_t zplversion;
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (B_TRUE);
+ }
+ /* XXX reading from non-owned objset */
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
rc = zplversion < version;
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
return (rc);
}
@@ -282,9 +286,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
}
static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
{
- uint64_t zoned;
int writable = 1;
/*
@@ -295,9 +298,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
!zone_dataset_visible(dataset, &writable))
return (ENOENT);
- if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
- return (ENOENT);
-
if (INGLOBALZONE(curthread)) {
/*
* If the fs is zoned, only root can access it from the
@@ -319,6 +319,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
return (0);
}
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+ return (ENOENT);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_prop_get_ds(ds, "jailed", 8, 1, &zoned, NULL)) {
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ return (ENOENT);
+ }
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
int
zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
{
@@ -333,9 +359,126 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
return (error);
}
+int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
+#ifdef SECLABEL
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
static int
-zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
+zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
{
+ char ds_hexsl[MAXNAMELEN];
+ bslabel_t ds_sl, new_sl;
+ boolean_t new_default = FALSE;
+ uint64_t zoned;
+ int needed_priv = -1;
+ int error;
+
+ /* First get the existing dataset label. */
+ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (EPERM);
+
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ new_default = TRUE;
+
+ /* The label must be translatable */
+ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+ return (EINVAL);
+
+ /*
+ * In a non-global zone, disallow attempts to set a label that
+ * doesn't match that of the zone; otherwise no other checks
+ * are needed.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+ return (EPERM);
+ return (0);
+ }
+
+ /*
+ * For global-zone datasets (i.e., those whose zoned property is
+ * "off", verify that the specified new label is valid for the
+ * global zone.
+ */
+ if (dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (EPERM);
+ if (!zoned) {
+ if (zfs_check_global_label(name, strval) != 0)
+ return (EPERM);
+ }
+
+ /*
+ * If the existing dataset label is nondefault, check if the
+ * dataset is mounted (label cannot be changed while mounted).
+ * Get the zfsvfs; if there isn't one, then the dataset isn't
+ * mounted (or isn't a dataset, doesn't exist, ...).
+ */
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+ objset_t *os;
+ static char *setsl_tag = "setsl_tag";
+
+ /*
+ * Try to own the dataset; abort if there is any error,
+ * (e.g., already mounted, in use, or other error).
+ */
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
+ setsl_tag, &os);
+ if (error)
+ return (EPERM);
+
+ dmu_objset_disown(os, setsl_tag);
+
+ if (new_default) {
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ goto out_check;
+ }
+
+ if (hexstr_to_label(strval, &new_sl) != 0)
+ return (EPERM);
+
+ if (blstrictdom(&ds_sl, &new_sl))
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ else if (blstrictdom(&new_sl, &ds_sl))
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ } else {
+ /* dataset currently has a default label */
+ if (!new_default)
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ }
+
+out_check:
+ if (needed_priv != -1)
+ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+ return (0);
+}
+#endif /* SECLABEL */
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+ cred_t *cr)
+{
+ char *strval;
+
/*
* Check permissions for special properties.
*/
@@ -357,16 +500,33 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
* quota on things *under* (ie. contained by)
* the thing they own.
*/
- if (dsl_prop_get_integer(name, "jailed", &zoned,
+ if (dsl_prop_get_integer(dsname, "jailed", &zoned,
setpoint))
return (EPERM);
- if (!zoned || strlen(name) <= strlen(setpoint))
+ if (!zoned || strlen(dsname) <= strlen(setpoint))
return (EPERM);
}
break;
+
+ case ZFS_PROP_MLSLABEL:
+#ifdef SECLABEL
+ if (!is_system_labeled())
+ return (EPERM);
+
+ if (nvpair_value_string(propval, &strval) == 0) {
+ int err;
+
+ err = zfs_set_slabel_policy(dsname, strval, CRED());
+ if (err != 0)
+ return (err);
+ }
+#else
+ return (EOPNOTSUPP);
+#endif
+ break;
}
- return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr));
+ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
}
int
@@ -388,20 +548,45 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr)
int
zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
{
- int error;
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_ROLLBACK, cr);
- if (error == 0)
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr);
- return (error);
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_ROLLBACK, cr));
}
int
zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_SEND, cr));
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (EINVAL);
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
}
static int
@@ -495,19 +680,34 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
}
/*
- * Must have sys_config privilege to check the iscsi permission
+ * Destroying snapshots with delegated permissions requires
+ * descendent mount and destroy permissions.
+ * Reassemble the full filesystem@snap name so dsl_deleg_access()
+ * can do the correct permission check.
+ *
+ * Since this routine is used when doing a recursive destroy of snapshots
+ * and destroying snapshots requires descendent permissions, a successful
+ * check of the top level snapshot applies to snapshots of all descendent
+ * datasets as well.
*/
-/* ARGSUSED */
static int
-zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
{
- return (secpolicy_zfs(cr));
+ int error;
+ char *dsname;
+
+ dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+
+ error = zfs_secpolicy_destroy_perms(dsname, cr);
+
+ strfree(dsname);
+ return (error);
}
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[MAXNAMELEN];
int error;
if ((error = zfs_secpolicy_write_perms(from,
@@ -542,7 +742,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[MAXNAMELEN];
objset_t *clone;
int error;
@@ -551,20 +751,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
if (error)
return (error);
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(zc->zc_name, FTAG, &clone);
if (error == 0) {
dsl_dataset_t *pclone = NULL;
dsl_dir_t *dd;
- dd = clone->os->os_dsl_dataset->ds_dir;
+ dd = clone->os_dsl_dataset->ds_dir;
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
error = dsl_dataset_hold_obj(dd->dd_pool,
dd->dd_phys->dd_origin_obj, FTAG, &pclone);
rw_exit(&dd->dd_pool->dp_config_rwlock);
if (error) {
- dmu_objset_close(clone);
+ dmu_objset_rele(clone, FTAG);
return (error);
}
@@ -572,7 +771,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
ZFS_DELEG_PERM_MOUNT, cr);
dsl_dataset_name(pclone, parentname);
- dmu_objset_close(clone);
+ dmu_objset_rele(clone, FTAG);
dsl_dataset_rele(pclone, FTAG);
if (error == 0)
error = zfs_secpolicy_write_perms(parentname,
@@ -601,16 +800,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr)
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
- int error;
-
- if ((error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0)
- return (error);
-
- error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_MOUNT, cr);
-
- return (error);
+ return (zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_SNAPSHOT, cr));
}
static int
@@ -623,8 +814,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr)
{
- char parentname[MAXNAMELEN];
- int error;
+ char parentname[MAXNAMELEN];
+ int error;
if ((error = zfs_get_parent(zc->zc_name, parentname,
sizeof (parentname))) != 0)
@@ -673,19 +864,19 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
}
/*
- * Just like zfs_secpolicy_config, except that we will check for
- * mount permission on the dataset for permission to create/remove
- * the minor nodes.
+ * Policy for object to name lookups.
*/
+/* ARGSUSED */
static int
-zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr)
{
- if (secpolicy_sys_config(cr, B_FALSE) != 0) {
- return (dsl_deleg_access(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr));
- }
+ int error;
- return (0);
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
}
/*
@@ -709,28 +900,11 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_USERPROP, cr));
} else {
- if (!zfs_prop_inheritable(prop))
- return (EINVAL);
- return (zfs_secpolicy_setprop(zc->zc_name, prop, cr));
+ return (zfs_secpolicy_setprop(zc->zc_name, prop,
+ NULL, cr));
}
}
-/*
- * Policy for dataset backup operations (sendbackup).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-static int
-zfs_secpolicy_operator(const char *dataset, cred_t *cr)
-{
- int writable = 1;
-
- if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
- if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
- return (EPERM);
- return (0);
-}
-
static int
zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr)
{
@@ -777,14 +951,56 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
{
- return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+ NULL, cr));
+}
+
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_HOLD, cr));
+}
+
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RELEASE, cr));
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+ * Delegated 'diff' permission alone is sufficient for us to allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_hold(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_release(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_destroy(zc, cr);
+ return (error);
}
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
-get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
{
char *packed;
int error;
@@ -798,7 +1014,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
packed = kmem_alloc(size, KM_SLEEP);
- if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) {
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
kmem_free(packed, size);
return (error);
}
@@ -815,11 +1032,46 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
}
static int
+fit_error_list(zfs_cmd_t *zc, nvlist_t **errors)
+{
+ size_t size;
+
+ VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ nvpair_t *more_errors;
+ int n = 0;
+
+ if (zc->zc_nvlist_dst_size < 1024)
+ return (ENOMEM);
+
+ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0);
+ more_errors = nvlist_prev_nvpair(*errors, NULL);
+
+ do {
+ nvpair_t *pair = nvlist_prev_nvpair(*errors,
+ more_errors);
+ VERIFY(nvlist_remove_nvpair(*errors, pair) == 0);
+ n++;
+ VERIFY(nvlist_size(*errors, &size,
+ NV_ENCODE_NATIVE) == 0);
+ } while (size > zc->zc_nvlist_dst_size);
+
+ VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0);
+ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0);
+ ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+ ASSERT(size <= zc->zc_nvlist_dst_size);
+ }
+
+ return (0);
+}
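+/*
+ * Editorial example (not part of the original change, property names
+ * hypothetical): if the caller's zc_nvlist_dst buffer can only hold two
+ * entries of a five-entry errors list, fit_error_list() trims pairs
+ * from the tail and accounts for them, so the consumer sees:
+ *
+ *	prop1 = EINVAL
+ *	prop2 = EPERM
+ *	ZPROP_N_MORE_ERRORS = 3
+ */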
+
+static int
put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
{
char *packed = NULL;
+ int error = 0;
size_t size;
- int error;
VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
@@ -837,8 +1089,9 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
packed = kmem_alloc(size, KM_SLEEP);
VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
KM_SLEEP) == 0);
- error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size);
+ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size, zc->zc_iflags) != 0)
+ error = EFAULT;
kmem_free(packed, size);
}
@@ -847,25 +1100,28 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
}
static int
-getzfsvfs(const char *dsname, zfsvfs_t **zvp)
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
{
objset_t *os;
int error;
- error = dmu_objset_open(dsname, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os);
+ error = dmu_objset_hold(dsname, FTAG, &os);
if (error)
return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
- mutex_enter(&os->os->os_user_ptr_lock);
- *zvp = dmu_objset_get_user(os);
- if (*zvp) {
- VFS_HOLD((*zvp)->z_vfs);
+ mutex_enter(&os->os_user_ptr_lock);
+ *zfvp = dmu_objset_get_user(os);
+ if (*zfvp) {
+ VFS_HOLD((*zfvp)->z_vfs);
} else {
error = ESRCH;
}
- mutex_exit(&os->os->os_user_ptr_lock);
- dmu_objset_close(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -874,22 +1130,22 @@ getzfsvfs(const char *dsname, zfsvfs_t **zvp)
* case its z_vfs will be NULL, and it will be opened as the owner.
*/
static int
-zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
{
int error = 0;
- int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0);
- if (getzfsvfs(name, zvp) != 0)
- error = zfsvfs_create(name, mode, zvp);
+ if (getzfsvfs(name, zfvp) != 0)
+ error = zfsvfs_create(name, zfvp);
if (error == 0) {
- rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag);
- if ((*zvp)->z_unmounted) {
+ rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+ RW_READER, tag);
+ if ((*zfvp)->z_unmounted) {
/*
* XXX we could probably try again, since the unmounting
* thread should be just about to disassociate the
* objset from the zfsvfs.
*/
- rrw_exit(&(*zvp)->z_teardown_lock, tag);
+ rrw_exit(&(*zfvp)->z_teardown_lock, tag);
return (EBUSY);
}
}
@@ -904,7 +1160,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
if (zfsvfs->z_vfs) {
VFS_RELE(zfsvfs->z_vfs);
} else {
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
}
}
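/*
 * Editorial usage sketch (mirrors callers later in this file, e.g. the
 * userquota path):
 *
 *	zfsvfs_t *zfsvfs;
 *	int err;
 *
 *	if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE)) == 0) {
 *		// ... operate on zfsvfs->z_os under z_teardown_lock ...
 *		zfsvfs_rele(zfsvfs, FTAG);
 *	}
 */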
@@ -919,11 +1175,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
char *buf;
if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config))
+ zc->zc_iflags, &config))
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -962,8 +1219,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
/*
* Set the remaining root properties
*/
- if (!error &&
- (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0)
+ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
+ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
(void) spa_destroy(zc->zc_name);
if (buf != NULL)
@@ -984,22 +1241,25 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
int error;
zfs_log_history(zc);
error = spa_destroy(zc->zc_name);
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
return (error);
}
static int
zfs_ioc_pool_import(zfs_cmd_t *zc)
{
- int error;
nvlist_t *config, *props = NULL;
uint64_t guid;
+ int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) != 0)
+ zc->zc_iflags, &config)) != 0)
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -1007,11 +1267,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != zc->zc_guid)
error = EINVAL;
- else if (zc->zc_cookie)
- error = spa_import_verbatim(zc->zc_name, config,
- props);
else
- error = spa_import(zc->zc_name, config, props);
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
nvlist_free(config);
@@ -1030,6 +1294,8 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force, hardforce);
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
return (error);
}
@@ -1087,7 +1353,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &tryconfig)) != 0)
+ zc->zc_iflags, &tryconfig)) != 0)
return (error);
config = spa_tryimport(tryconfig);
@@ -1103,8 +1369,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ */
static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
@@ -1112,7 +1383,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
- error = spa_scrub(spa, zc->zc_cookie);
+ if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
spa_close(spa, FTAG);
@@ -1175,9 +1449,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
hist_buf = kmem_alloc(size, KM_SLEEP);
if ((error = spa_history_get(spa, &zc->zc_history_offset,
&zc->zc_history_len, hist_buf)) == 0) {
- error = xcopyout(hist_buf,
- (char *)(uintptr_t)zc->zc_history,
- zc->zc_history_len);
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
}
spa_close(spa, FTAG);
@@ -1196,18 +1470,59 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
return (0);
}
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_value name of object
+ */
static int
zfs_ioc_obj_to_path(zfs_cmd_t *zc)
{
- objset_t *osp;
+ objset_t *os;
int error;
- if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0)
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
return (error);
- error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
+ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
sizeof (zc->zc_value));
- dmu_objset_close(osp);
+ dmu_objset_rele(os, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -1217,20 +1532,15 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
-#ifdef sun
nvlist_t *config, **l2cache, **spares;
uint_t nl2cache = 0, nspares = 0;
-#else
- nvlist_t *config;
-#endif
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config);
-#ifdef sun
+ zc->zc_iflags, &config);
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache);
@@ -1247,11 +1557,11 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
*
* l2cache and spare devices are ok to be added to a rootpool.
*/
- if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
+ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
+ nvlist_free(config);
spa_close(spa, FTAG);
return (EDOM);
}
-#endif
if (error == 0) {
error = spa_vdev_add(spa, config);
@@ -1261,6 +1571,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_nvlist_conf nvlist of devices to remove
+ * zc_cookie to stop the remove?
+ */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
@@ -1294,11 +1610,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
break;
case VDEV_STATE_FAULTED:
- error = vdev_fault(spa, zc->zc_guid);
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
break;
case VDEV_STATE_DEGRADED:
- error = vdev_degrade(spa, zc->zc_guid);
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
break;
default:
@@ -1321,7 +1645,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
return (error);
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) == 0) {
+ zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
@@ -1346,6 +1670,41 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
}
static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config, *props = NULL;
+ int error;
+ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ spa_close(spa, FTAG);
+ nvlist_free(config);
+ return (error);
+ }
+
+ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
spa_t *spa;
@@ -1379,6 +1738,35 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
return (error);
}
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_USER hold, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ * XXX reading without owning
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent) {
+ if (dmu_objset_type(os) == DMU_OST_ZVOL)
+ VERIFY(zvol_get_stats(os, nv) == 0);
+ }
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
/*
* inputs:
* zc_name name of filesystem
@@ -1394,34 +1782,59 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os = NULL;
int error;
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+
+ error = zfs_ioc_objset_stats_impl(zc, os);
+
+ dmu_objset_rele(os, FTAG);
+
+ if (error == ENOMEM)
+ error = 0;
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst received property nvlist
+ * zc_nvlist_dst_size size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
nvlist_t *nv;
- if (error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
return (error);
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+ /*
+ * Without this check, we would return local property values if the
+ * caller has not already received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ if (!dsl_prop_get_hasrecvd(os)) {
+ dmu_objset_rele(os, FTAG);
+ return (ENOTSUP);
+ }
if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) {
- dmu_objset_stats(os, nv);
- /*
- * NB: zvol_get_stats() will read the objset contents,
- * which we aren't supposed to do with a
- * DS_MODE_USER hold, because it could be
- * inconsistent. So this is a bit of a workaround...
- */
- if (!zc->zc_objset_stats.dds_inconsistent) {
- if (dmu_objset_type(os) == DMU_OST_ZVOL)
- VERIFY(zvol_get_stats(os, nv) == 0);
- }
+ (error = dsl_prop_get_received(os, &nv)) == 0) {
error = put_nvlist(zc, nv);
nvlist_free(nv);
}
- dmu_objset_close(os);
- if (error == ENOMEM)
- error = 0;
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -1456,8 +1869,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
objset_t *os;
int err;
- if (err = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+ /* XXX reading without owning */
+ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
return (err);
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
@@ -1482,11 +1895,11 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
} else {
err = ENOENT;
}
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (err);
}
-static boolean_t
+boolean_t
dataset_name_hidden(const char *name)
{
/*
@@ -1522,9 +1935,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
objset_t *os;
int error;
char *p;
+ size_t orig_len = strlen(zc->zc_name);
- if (error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) {
+top:
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
if (error == ENOENT)
error = ESRCH;
return (error);
@@ -1544,7 +1958,7 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
int len = sizeof (zc->zc_name) - (p - zc->zc_name);
while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
- (void) dmu_objset_prefetch(p, NULL);
+ (void) dmu_objset_prefetch(zc->zc_name, NULL);
}
do {
@@ -1553,12 +1967,22 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
NULL, &zc->zc_cookie);
if (error == ENOENT)
error = ESRCH;
- } while (error == 0 && dataset_name_hidden(zc->zc_name));
- dmu_objset_close(os);
+ } while (error == 0 && dataset_name_hidden(zc->zc_name) &&
+ !(zc->zc_iflags & FKIOCTL));
+ dmu_objset_rele(os, FTAG);
- if (error == 0)
+ /*
+ * If it's an internal dataset (ie. with a '$' in its name),
+ * don't try to get stats for it, otherwise we'll return ENOENT.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
+ if (error == ENOENT) {
+ /* We lost a race with destroy, get the next one. */
+ zc->zc_name[orig_len] = '\0';
+ goto top;
+ }
+ }
return (error);
}
@@ -1580,299 +2004,363 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
objset_t *os;
int error;
- error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os);
+top:
+ if (zc->zc_cookie == 0)
+ (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
+ NULL, DS_FIND_SNAPSHOTS);
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error)
return (error == ENOENT ? ESRCH : error);
- if (zc->zc_cookie == 0) {
- (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
- NULL, DS_FIND_SNAPSHOTS);
- }
/*
* A dataset name of maximum length cannot have any snapshots,
* so exit immediately.
*/
if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (ESRCH);
}
error = dmu_snapshot_list_next(os,
sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
- dmu_objset_close(os);
- if (error == 0)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
- else if (error == ENOENT)
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
+ NULL);
+
+ if (error == 0) {
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+
+ /*
+ * Since we probably don't have a hold on this snapshot,
+ * it's possible that the objsetid could have been destroyed
+ * and reused for a new objset. It's OK if this happens during
+ * a zfs send operation, since the new createtxg will be
+ * beyond the range we're interested in.
+ */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ if (error == ENOENT) {
+ /* Racing with destroy, get the next one. */
+ *strchr(zc->zc_name, '@') = '\0';
+ dmu_objset_rele(os, FTAG);
+ goto top;
+ }
+ } else {
+ objset_t *ossnap;
+
+ error = dmu_objset_from_ds(ds, &ossnap);
+ if (error == 0)
+ error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ } else if (error == ENOENT) {
error = ESRCH;
+ }
+ dmu_objset_rele(os, FTAG);
/* if we failed, undo the @ that we tacked on to zc_name */
if (error)
*strchr(zc->zc_name, '@') = '\0';
return (error);
}
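/*
 * Editorial note on the retry idiom above (not part of the original
 * change): neither list-next ioctl holds a long-lived reference on the
 * object it is about to stat, so a concurrent destroy can make the
 * freshly returned name vanish. The pattern is:
 *
 *	top:
 *		look up the next name
 *		if filling in stats fails with ENOENT:
 *			restore the name buffer and goto top
 *
 * which simply skips entries that lost the race.
 */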
-int
-zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
- nvpair_t *elem;
- int error = 0;
- uint64_t intval;
- char *strval;
- nvlist_t *genericnvl;
- boolean_t issnap = (strchr(name, '@') != NULL);
+ const char *propname = nvpair_name(pair);
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *domain;
+ char *dash;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+ int err;
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) != 0)
+ return (EINVAL);
+ }
/*
- * First validate permission to set all of the properties
+ * A correctly constructed propname is encoded as
+ * userquota@<rid>-<domain>.
*/
- VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
+ if ((dash = strchr(propname, '-')) == NULL ||
+ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+ vallen != 3)
+ return (EINVAL);
- if (prop == ZPROP_INVAL) {
- /*
- * If this is a user-defined property, it must be a
- * string, and there is no further validation to do.
- */
- if (zfs_prop_user(propname) &&
- nvpair_type(elem) == DATA_TYPE_STRING) {
- if (error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_USERPROP, CRED()))
- return (error);
- continue;
- }
+ domain = dash + 1;
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
- if (!issnap && zfs_prop_userquota(propname) &&
- nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) {
- const char *perm;
- const char *up = zfs_userquota_prop_prefixes
- [ZFS_PROP_USERQUOTA];
- if (strncmp(propname, up, strlen(up)) == 0)
- perm = ZFS_DELEG_PERM_USERQUOTA;
- else
- perm = ZFS_DELEG_PERM_GROUPQUOTA;
- if (error = zfs_secpolicy_write_perms(name,
- perm, CRED()))
- return (error);
- continue;
- }
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+ if (err == 0) {
+ err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
- return (EINVAL);
- }
+ return (err);
+}
- if (issnap)
- return (EINVAL);
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise if it is
+ * not one of the special properties handled by this function, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err;
- if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
- return (error);
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_userquota(propname))
+ return (zfs_prop_set_userquota(dsname, pair));
+ return (-1);
+ }
- /*
- * Check that this value is valid for this pool version
- */
- switch (prop) {
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0) {
- if (intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9 &&
- zfs_earlier_version(name,
- SPA_VERSION_GZIP_COMPRESSION))
- return (ENOTSUP);
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
- /*
- * If this is a bootable dataset then
- * verify that the compression algorithm
- * is supported for booting. We must return
- * something other than ENOTSUP since it
- * implies a downrev pool version.
- */
- if (zfs_is_bootfs(name) &&
- !BOOTFS_COMPRESS_VALID(intval))
- return (ERANGE);
- }
- break;
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
+ return (-1);
- case ZFS_PROP_COPIES:
- if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS))
- return (ENOTSUP);
- break;
+ VERIFY(0 == nvpair_value_uint64(pair, &intval));
- case ZFS_PROP_SHARESMB:
- if (zpl_earlier_version(name, ZPL_VERSION_FUID))
- return (ENOTSUP);
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ err = dsl_dir_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFQUOTA:
+ err = dsl_dataset_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_RESERVATION:
+ err = dsl_dir_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFRESERVATION:
+ err = dsl_dataset_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_VOLSIZE:
+ err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
+ intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
break;
- case ZFS_PROP_ACLINHERIT:
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0)
- if (intval == ZFS_ACL_PASSTHROUGH_X &&
- zfs_earlier_version(name,
- SPA_VERSION_PASSTHROUGH_X))
- return (ENOTSUP);
- }
- }
+ err = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
+ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t *zc;
- if (prop == ZPROP_INVAL) {
- if (zfs_prop_userquota(propname)) {
- uint64_t *valary;
- unsigned int vallen;
- const char *domain;
- zfs_userquota_prop_t type;
- uint64_t rid;
- uint64_t quota;
- zfsvfs_t *zfsvfs;
-
- VERIFY(nvpair_value_uint64_array(elem,
- &valary, &vallen) == 0);
- VERIFY(vallen == 3);
- type = valary[0];
- rid = valary[1];
- quota = valary[2];
- domain = propname +
- strlen(zfs_userquota_prop_prefixes[type]);
-
- error = zfsvfs_hold(name, B_FALSE, FTAG,
- &zfsvfs);
- if (error == 0) {
- error = zfs_set_userquota(zfsvfs,
- type, domain, rid, quota);
- zfsvfs_rele(zfsvfs, FTAG);
- }
- if (error == 0)
- continue;
- else
- goto out;
- } else if (zfs_prop_user(propname)) {
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- goto out;
- }
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strcpy(zc->zc_name, dsname);
+ (void) zfs_ioc_userspace_upgrade(zc);
+ kmem_free(zc, sizeof (zfs_cmd_t));
}
+ break;
+ }
- switch (prop) {
- case ZFS_PROP_QUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_quota(name, intval)) != 0)
- goto out;
- break;
-
- case ZFS_PROP_REFQUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_quota(name, intval)) != 0)
- goto out;
- break;
-
- case ZFS_PROP_RESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_reservation(name,
- intval)) != 0)
- goto out;
- break;
+ default:
+ err = -1;
+ }
- case ZFS_PROP_REFRESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_reservation(name,
- intval)) != 0)
- goto out;
- break;
+ return (err);
+}
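+/*
+ * Editorial sketch of the dispatch convention above (not part of the
+ * original change): callers try the special-case path first and fall
+ * back to the generic dsl_prop path only when -1 says "not special":
+ *
+ *	err = zfs_prop_set_special(dsname, source, pair);
+ *	if (err == -1) {
+ *		// not special: queue for the generic nvlist-based set
+ *	} else if (err != 0) {
+ *		// special and failed: record err for this property
+ *	}
+ */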
- case ZFS_PROP_VOLSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volsize(name,
- ddi_driver_major(zfs_dip), intval)) != 0)
- goto out;
- break;
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the first error
+ * encountered. If the caller provides a non-NULL errlist, it also gives the
+ * complete list of names of all the properties it failed to set along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property is set successfully, zero is returned and the list pointed
+ * at by errlist is NULL.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+ nvlist_t **errlist)
+{
+ nvpair_t *pair;
+ nvpair_t *propval;
+ int rv = 0;
+ uint64_t intval;
+ char *strval;
+ nvlist_t *genericnvl;
+ nvlist_t *errors;
+ nvlist_t *retrynvl;
- case ZFS_PROP_VOLBLOCKSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volblocksize(name, intval)) != 0)
- goto out;
- break;
+ VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- case ZFS_PROP_VERSION:
- {
- zfsvfs_t *zfsvfs;
-
- if ((error = nvpair_value_uint64(elem, &intval)) != 0)
- goto out;
- if ((error = zfsvfs_hold(name, B_FALSE, FTAG,
- &zfsvfs)) != 0)
- goto out;
- error = zfs_set_version(zfsvfs, intval);
- zfsvfs_rele(zfsvfs, FTAG);
-
- if (error == 0 && intval >= ZPL_VERSION_USERSPACE) {
- zfs_cmd_t zc = { 0 };
- (void) strcpy(zc.zc_name, name);
- (void) zfs_ioc_userspace_upgrade(&zc);
- }
- if (error)
- goto out;
- break;
+retry:
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ int err = 0;
+
+ /* decode the property value */
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) != 0)
+ err = EINVAL;
}
- default:
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) !=
- PROP_TYPE_STRING) {
- error = EINVAL;
- goto out;
- }
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ /* Validate value type */
+ if (err == 0 && prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (nvpair_type(propval) != DATA_TYPE_STRING)
+ err = EINVAL;
+ } else if (zfs_prop_userquota(propname)) {
+ if (nvpair_type(propval) !=
+ DATA_TYPE_UINT64_ARRAY)
+ err = EINVAL;
+ }
+ } else if (err == 0) {
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+ err = EINVAL;
+ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
const char *unused;
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+ VERIFY(nvpair_value_uint64(propval,
+ &intval) == 0);
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
break;
case PROP_TYPE_STRING:
- error = EINVAL;
- goto out;
+ err = EINVAL;
+ break;
case PROP_TYPE_INDEX:
if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0) {
- error = EINVAL;
- goto out;
- }
+ intval, &unused) != 0)
+ err = EINVAL;
break;
default:
cmn_err(CE_PANIC,
"unknown property type");
- break;
}
} else {
- error = EINVAL;
- goto out;
+ err = EINVAL;
+ }
+ }
+
+ /* Validate permissions */
+ if (err == 0)
+ err = zfs_check_settable(dsname, pair, CRED());
+
+ if (err == 0) {
+ err = zfs_prop_set_special(dsname, source, pair);
+ if (err == -1) {
+ /*
+ * For better performance we build up a list of
+ * properties to set in a single transaction.
+ */
+ err = nvlist_add_nvpair(genericnvl, pair);
+ } else if (err != 0 && nvl != retrynvl) {
+ /*
+ * This may be a spurious error caused by
+ * receiving quota and reservation out of order.
+ * Try again in a second pass.
+ */
+ err = nvlist_add_nvpair(retrynvl, pair);
}
- if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0)
- goto out;
}
+
+ if (err != 0)
+ VERIFY(nvlist_add_int32(errors, propname, err) == 0);
}
- if (nvlist_next_nvpair(genericnvl, NULL) != NULL) {
- error = dsl_props_set(name, genericnvl);
+ if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+ nvl = retrynvl;
+ goto retry;
+ }
+
+ if (!nvlist_empty(genericnvl) &&
+ dsl_props_set(dsname, source, genericnvl) != 0) {
+ /*
+ * If this fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ int err = 0;
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) == 0);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ VERIFY(nvpair_value_string(propval,
+ &strval) == 0);
+ err = dsl_prop_set(dsname, propname, source, 1,
+ strlen(strval) + 1, strval);
+ } else {
+ VERIFY(nvpair_value_uint64(propval,
+ &intval) == 0);
+ err = dsl_prop_set(dsname, propname, source, 8,
+ 1, &intval);
+ }
+
+ if (err != 0) {
+ VERIFY(nvlist_add_int32(errors, propname,
+ err) == 0);
+ }
+ }
}
-out:
nvlist_free(genericnvl);
- return (error);
+ nvlist_free(retrynvl);
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
}
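
To make the errlist contract above concrete, here is a minimal sketch of a hypothetical in-kernel caller consuming the errlist; the iteration helpers are the stock nvpair routines already used throughout this file, but the caller itself is illustrative, not part of this change:

	nvlist_t *errlist = NULL;
	nvpair_t *pair = NULL;
	int err;

	err = zfs_set_prop_nvlist(dsname, ZPROP_SRC_LOCAL, props, &errlist);
	if (errlist != NULL) {
		/* each pair maps a failed property name to its errno */
		while ((pair = nvlist_next_nvpair(errlist, pair)) != NULL) {
			int32_t perr;

			VERIFY(nvpair_value_int32(pair, &perr) == 0);
			cmn_err(CE_NOTE, "prop %s: error %d",
			    nvpair_name(pair), perr);
		}
		nvlist_free(errlist);	/* caller owns and frees errlist */
	}
	/* err is 0 if and only if errlist came back NULL */
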
/*
@@ -1881,15 +2369,15 @@ out:
static int
zfs_check_userprops(char *fsname, nvlist_t *nvl)
{
- nvpair_t *elem = NULL;
+ nvpair_t *pair = NULL;
int error = 0;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
char *valstr;
if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
+ nvpair_type(pair) != DATA_TYPE_STRING)
return (EINVAL);
if (error = zfs_secpolicy_write_perms(fsname,
@@ -1899,49 +2387,96 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl)
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG);
- VERIFY(nvpair_value_string(elem, &valstr) == 0);
+ VERIFY(nvpair_value_string(pair, &valstr) == 0);
if (strlen(valstr) >= ZAP_MAXVALUELEN)
return (E2BIG);
}
return (0);
}
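
The same bounds can be pre-checked before ever crossing into the kernel. A standalone sketch, assuming ZAP_MAXNAMELEN is 256 and ZAP_MAXVALUELEN is 8192 (the values in sys/zap.h of this era) and that zfs_prop_user() requires a ':' in the name:

	#include <errno.h>
	#include <string.h>

	#define ZAP_MAXNAMELEN	256		/* assumed to match sys/zap.h */
	#define ZAP_MAXVALUELEN	(1024 * 8)	/* assumed to match sys/zap.h */

	/* Mirror of the kernel checks: returns 0 if the pair would pass. */
	static int
	userprop_check(const char *name, const char *value)
	{
		if (strchr(name, ':') == NULL)	/* zfs_prop_user() wants a ':' */
			return (EINVAL);
		if (strlen(name) >= ZAP_MAXNAMELEN)
			return (ENAMETOOLONG);
		if (strlen(value) >= ZAP_MAXVALUELEN)
			return (E2BIG);
		return (0);
	}
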
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+ nvpair_t *pair;
+
+ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+ if (nvlist_exists(skipped, nvpair_name(pair)))
+ continue;
+
+ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+ }
+}
+
+static int
+clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
+ nvlist_t *skipped)
+{
+ int err = 0;
+ nvlist_t *cleared_props = NULL;
+ props_skip(props, skipped, &cleared_props);
+ if (!nvlist_empty(cleared_props)) {
+ /*
+ * Acts on local properties until the dataset has received
+ * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+ */
+ zprop_source_t flags = (ZPROP_SRC_NONE |
+ (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
+ err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
+ }
+ nvlist_free(cleared_props);
+ return (err);
+}
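
props_skip() is a plain set difference over nvlists. A userland sketch of the same filter (note that outside the kernel nvlist_alloc() takes 0 rather than KM_SLEEP as its third argument):

	#include <assert.h>
	#include <libnvpair.h>

	int
	main(void)
	{
		nvlist_t *props, *skipped, *out;
		nvpair_t *p;

		assert(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
		assert(nvlist_add_string(props, "org.example:keep", "1") == 0);
		assert(nvlist_add_string(props, "org.example:drop", "2") == 0);

		assert(nvlist_alloc(&skipped, NV_UNIQUE_NAME, 0) == 0);
		assert(nvlist_add_boolean(skipped, "org.example:drop") == 0);

		/* the same filter props_skip() applies in the kernel */
		assert(nvlist_alloc(&out, NV_UNIQUE_NAME, 0) == 0);
		for (p = nvlist_next_nvpair(props, NULL); p != NULL;
		    p = nvlist_next_nvpair(props, p)) {
			if (!nvlist_exists(skipped, nvpair_name(p)))
				assert(nvlist_add_nvpair(out, p) == 0);
		}
		/* out now holds only "org.example:keep" */

		nvlist_free(props);
		nvlist_free(skipped);
		nvlist_free(out);
		return (0);
	}
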
+
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to set
* zc_nvlist_src{_size} nvlist of properties to apply
- * zc_cookie clear existing local props?
+ * zc_cookie received properties flag
*
- * outputs: none
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
*/
static int
zfs_ioc_set_prop(zfs_cmd_t *zc)
{
nvlist_t *nvl;
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+ ZPROP_SRC_LOCAL);
+ nvlist_t *errors = NULL;
int error;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvl)) != 0)
+ zc->zc_iflags, &nvl)) != 0)
return (error);
- if (zc->zc_cookie) {
+ if (received) {
nvlist_t *origprops;
objset_t *os;
- if (dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- if (dsl_prop_get_all(os, &origprops, TRUE) == 0) {
- clear_props(zc->zc_name, origprops, nvl);
+ if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
+ if (dsl_prop_get_received(os, &origprops) == 0) {
+ (void) clear_received_props(os,
+ zc->zc_name, origprops, nvl);
nvlist_free(origprops);
}
- dmu_objset_close(os);
- }
+ dsl_prop_set_hasrecvd(os);
+ dmu_objset_rele(os, FTAG);
+ }
}
- error = zfs_set_prop_nvlist(zc->zc_name, nvl);
+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors);
+ if (zc->zc_nvlist_dst != 0 && errors != NULL) {
+ (void) put_nvlist(zc, errors);
+ }
+
+ nvlist_free(errors);
nvlist_free(nvl);
return (error);
}
@@ -1950,14 +2485,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
* inputs:
* zc_name name of filesystem
* zc_value name of property to inherit
+ * zc_cookie revert to received value if TRUE
*
* outputs: none
*/
static int
zfs_ioc_inherit_prop(zfs_cmd_t *zc)
{
+ const char *propname = zc->zc_value;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received
+ ? ZPROP_SRC_NONE /* revert to received value, if any */
+ : ZPROP_SRC_INHERITED); /* explicitly inherit */
+
+ if (received) {
+ nvlist_t *dummy;
+ nvpair_t *pair;
+ zprop_type_t type;
+ int err;
+
+ /*
+ * zfs_prop_set_special() expects properties in the form of an
+ * nvpair with type info.
+ */
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(propname))
+ return (EINVAL);
+
+ type = PROP_TYPE_STRING;
+ } else if (prop == ZFS_PROP_VOLSIZE ||
+ prop == ZFS_PROP_VERSION) {
+ return (EINVAL);
+ } else {
+ type = zfs_prop_get_type(prop);
+ }
+
+ VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ switch (type) {
+ case PROP_TYPE_STRING:
+ VERIFY(0 == nvlist_add_string(dummy, propname, ""));
+ break;
+ case PROP_TYPE_NUMBER:
+ case PROP_TYPE_INDEX:
+ VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
+ break;
+ default:
+ nvlist_free(dummy);
+ return (EINVAL);
+ }
+
+ pair = nvlist_next_nvpair(dummy, NULL);
+ err = zfs_prop_set_special(zc->zc_name, source, pair);
+ nvlist_free(dummy);
+ if (err != -1)
+ return (err); /* special property already handled */
+ } else {
+ /*
+ * Only check this in the non-received case. We want to allow
+ * 'inherit -S' to revert non-inheritable properties like quota
+ * and reservation to the received or default values even though
+ * they are not considered inheritable.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ return (EINVAL);
+ }
+
/* the property name has been validated by zfs_secpolicy_inherit() */
- return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
+ return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
}
static int
@@ -1966,28 +2562,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
nvlist_t *props;
spa_t *spa;
int error;
- nvpair_t *elem;
+ nvpair_t *pair;
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)))
+ if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))
return (error);
/*
* If the only property is the configfile, then just do a spa_lookup()
* to handle the faulted case.
*/
- elem = nvlist_next_nvpair(props, NULL);
- if (elem != NULL && strcmp(nvpair_name(elem),
+ pair = nvlist_next_nvpair(props, NULL);
+ if (pair != NULL && strcmp(nvpair_name(pair),
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
- nvlist_next_nvpair(props, elem) == NULL) {
+ nvlist_next_nvpair(props, pair) == NULL) {
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_config_sync(spa, B_FALSE, B_TRUE);
}
mutex_exit(&spa_namespace_lock);
- if (spa != NULL)
+ if (spa != NULL) {
+ nvlist_free(props);
return (0);
+ }
}
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
@@ -2034,57 +2632,6 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
return (error);
}
-static int
-zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
-{
-#ifdef sun
- nvlist_t *nvp;
- int error;
- uint32_t uid;
- uint32_t gid;
- uint32_t *groups;
- uint_t group_cnt;
- cred_t *usercred;
-
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvp)) != 0) {
- return (error);
- }
-
- if ((error = nvlist_lookup_uint32(nvp,
- ZFS_DELEG_PERM_UID, &uid)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
-
- if ((error = nvlist_lookup_uint32(nvp,
- ZFS_DELEG_PERM_GID, &gid)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
-
- if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS,
- &groups, &group_cnt)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
- usercred = cralloc();
- if ((crsetugid(usercred, uid, gid) != 0) ||
- (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) {
- nvlist_free(nvp);
- crfree(usercred);
- return (EPERM);
- }
- nvlist_free(nvp);
- error = dsl_deleg_access(zc->zc_name,
- zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
- crfree(usercred);
- return (error);
-#else /* sun */
- return (EPERM);
-#endif /* sun */
-}
-
/*
* inputs:
* zc_name name of filesystem
@@ -2100,7 +2647,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
nvlist_t *fsaclnv = NULL;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &fsaclnv)) != 0)
+ zc->zc_iflags, &fsaclnv)) != 0)
return (error);
/*
@@ -2157,30 +2704,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
}
/*
- * inputs:
- * zc_name name of volume
- *
- * outputs: none
- */
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
- return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip)));
-}
-
-/*
- * inputs:
- * zc_name name of volume
- *
- * outputs: none
- */
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
- return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
* Search the vfs list for a specified resource. Returns a pointer to it
* or NULL if no suitable entry is found. The caller of this routine
* is responsible for releasing the returned vfs pointer.
@@ -2234,8 +2757,8 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
*/
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
- boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
- boolean_t *is_ci)
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
{
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
@@ -2271,6 +2794,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
(zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
(zplver < ZPL_VERSION_NORMALIZATION &&
(norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
sense != ZFS_PROP_UNDEFINED)))
@@ -2312,11 +2836,13 @@ static int
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok, sa_ok;
uint64_t zplver = ZPL_VERSION;
objset_t *os = NULL;
char parentname[MAXNAMELEN];
char *cp;
+ spa_t *spa;
+ uint64_t spa_vers;
int error;
(void) strlcpy(parentname, dataset, sizeof (parentname));
@@ -2324,23 +2850,25 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
ASSERT(cp != NULL);
cp[0] = '\0';
- if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
- zplver = ZPL_VERSION_USERSPACE - 1;
- if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
/*
* Open parent object set so we can inherit zplprop values.
*/
- if ((error = dmu_objset_open(parentname, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os)) != 0)
+ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
return (error);
- error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops,
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
zplprops, is_ci);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -2348,17 +2876,17 @@ static int
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
uint64_t zplver = ZPL_VERSION;
int error;
- if (spa_vers < SPA_VERSION_FUID) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
- error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops,
- zplprops, is_ci);
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
return (error);
}
@@ -2401,7 +2929,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
zct.zct_zplprops = NULL;
@@ -2417,21 +2945,18 @@ zfs_ioc_create(zfs_cmd_t *zc)
return (EINVAL);
}
- error = dmu_objset_open(zc->zc_value, type,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(zc->zc_value, FTAG, &clone);
if (error) {
nvlist_free(nvprops);
return (error);
}
- error = dmu_objset_create(zc->zc_name, type, clone, 0,
- NULL, NULL);
+ error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0);
+ dmu_objset_rele(clone, FTAG);
if (error) {
- dmu_objset_close(clone);
nvlist_free(nvprops);
return (error);
}
- dmu_objset_close(clone);
} else {
boolean_t is_insensitive = B_FALSE;
@@ -2488,7 +3013,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
return (error);
}
}
- error = dmu_objset_create(zc->zc_name, type, NULL,
+ error = dmu_objset_create(zc->zc_name, type,
is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
nvlist_free(zct.zct_zplprops);
}
@@ -2497,10 +3022,16 @@ zfs_ioc_create(zfs_cmd_t *zc)
* It would be nice to do this atomically.
*/
if (error == 0) {
- if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
+ error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL,
+ nvprops, NULL);
+ if (error != 0)
+ (void) dmu_objset_destroy(zc->zc_name, B_FALSE);
}
nvlist_free(nvprops);
+#ifdef __FreeBSD__
+ if (error == 0 && type == DMU_OST_ZVOL)
+ zvol_create_minors(zc->zc_name);
+#endif
return (error);
}
@@ -2511,7 +3042,8 @@ zfs_ioc_create(zfs_cmd_t *zc)
* zc_cookie recursive flag
* zc_nvlist_src[_size] property list
*
- * outputs: none
+ * outputs:
+ * zc_value short snapname (i.e. part after the '@')
*/
static int
zfs_ioc_snapshot(zfs_cmd_t *zc)
@@ -2525,21 +3057,21 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
error = zfs_check_userprops(zc->zc_name, nvprops);
if (error)
goto out;
- if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL &&
+ if (!nvlist_empty(nvprops) &&
zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
error = ENOTSUP;
goto out;
}
- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
- nvprops, recursive);
+ error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL,
+ nvprops, recursive, B_FALSE, -1);
out:
nvlist_free(nvprops);
@@ -2547,20 +3079,15 @@ out:
}
int
-zfs_unmount_snap(char *name, void *arg)
+zfs_unmount_snap(const char *name, void *arg)
{
vfs_t *vfsp = NULL;
if (arg) {
char *snapname = arg;
- int len = strlen(name) + strlen(snapname) + 2;
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- (void) strcpy(buf, name);
- (void) strcat(buf, "@");
- (void) strcat(buf, snapname);
- vfsp = zfs_get_vfs(buf);
- kmem_free(buf, len);
+ char *fullname = kmem_asprintf("%s@%s", name, snapname);
+ vfsp = zfs_get_vfs(fullname);
+ strfree(fullname);
} else if (strchr(name, '@')) {
vfsp = zfs_get_vfs(name);
}
@@ -2586,8 +3113,9 @@ zfs_unmount_snap(char *name, void *arg)
/*
* inputs:
- * zc_name name of filesystem
- * zc_value short name of snapshot
+ * zc_name name of filesystem
+ * zc_value short name of snapshot
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
@@ -2602,26 +3130,32 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
if (err)
return (err);
- return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+ return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value,
+ zc->zc_defer_destroy));
}
/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
+ int err;
if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
+ err = zfs_unmount_snap(zc->zc_name, NULL);
if (err)
return (err);
}
- return (dmu_objset_destroy(zc->zc_name));
+ err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
+ if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+ (void) zvol_remove_minor(zc->zc_name);
+ return (err);
}
/*
@@ -2633,38 +3167,78 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
static int
zfs_ioc_rollback(zfs_cmd_t *zc)
{
- objset_t *os;
+ dsl_dataset_t *ds, *clone;
int error;
- zfsvfs_t *zfsvfs = NULL;
+ zfsvfs_t *zfsvfs;
+ char *clone_name;
+
+ error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
+ if (error)
+ return (error);
+
+ /* must not be a snapshot */
+ if (dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EINVAL);
+ }
+
+ /* must have a most recent snapshot */
+ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EINVAL);
+ }
/*
- * Get the zfsvfs for the receiving objset. There
- * won't be one if we're operating on a zvol, if the
- * objset doesn't exist yet, or is not mounted.
+ * Create clone of most recent snapshot.
*/
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os);
+ clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
+ error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
if (error)
- return (error);
+ goto out;
- if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
- int mode;
+ error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone);
+ if (error)
+ goto out;
- error = zfs_suspend_fs(zfsvfs, NULL, &mode);
+ /*
+ * Do clone swap.
+ */
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
int resume_err;
- error = dmu_objset_rollback(os);
- resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
+ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+ error = dsl_dataset_clone_swap(clone, ds,
+ B_TRUE);
+ dsl_dataset_disown(ds, FTAG);
+ ds = NULL;
+ } else {
+ error = EBUSY;
+ }
+ resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
error = error ? error : resume_err;
- } else {
- dmu_objset_close(os);
}
VFS_RELE(zfsvfs->z_vfs);
} else {
- error = dmu_objset_rollback(os);
+ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+ error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
+ dsl_dataset_disown(ds, FTAG);
+ ds = NULL;
+ } else {
+ error = EBUSY;
+ }
}
- /* Note, the dmu_objset_rollback() releases the objset for us. */
+ /*
+ * Destroy clone (which also closes it).
+ */
+ (void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+
+out:
+ strfree(clone_name);
+ if (ds)
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
@@ -2697,31 +3271,267 @@ zfs_ioc_rename(zfs_cmd_t *zc)
if (err)
return (err);
}
+ if (zc->zc_objset_type == DMU_OST_ZVOL)
+ (void) zvol_remove_minor(zc->zc_name);
return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
}
-static void
-clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+ const char *propname = nvpair_name(pair);
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (err = zfs_secpolicy_write_perms(dsname,
+ ZFS_DELEG_PERM_USERPROP, cr))
+ return (err);
+ return (0);
+ }
+
+ if (!issnap && zfs_prop_userquota(propname)) {
+ const char *perm = NULL;
+ const char *uq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+ const char *gq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+
+ if (strncmp(propname, uq_prefix,
+ strlen(uq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ } else if (strncmp(propname, gq_prefix,
+ strlen(gq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ } else {
+ /* USERUSED and GROUPUSED are read-only */
+ return (EINVAL);
+ }
+
+ if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
+ return (err);
+ return (0);
+ }
+
+ return (EINVAL);
+ }
+
+ if (issnap)
+ return (EINVAL);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /*
+ * Check that this value is valid for this pool version
+ */
+ switch (prop) {
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval >= ZIO_COMPRESS_GZIP_1 &&
+ intval <= ZIO_COMPRESS_GZIP_9 &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_GZIP_COMPRESSION)) {
+ return (ENOTSUP);
+ }
+
+ if (intval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (ENOTSUP);
+
+ /*
+ * If this is a bootable dataset then
+ * verify that the compression algorithm
+ * is supported for booting. We must return
+ * something other than ENOTSUP since it
+ * implies a downrev pool version.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ !BOOTFS_COMPRESS_VALID(intval)) {
+ return (ERANGE);
+ }
+ }
+ break;
+
+ case ZFS_PROP_COPIES:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_DEDUP:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_SHARESMB:
+ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_ACLINHERIT:
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval == ZFS_ACL_PASSTHROUGH_X &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_PASSTHROUGH_X))
+ return (ENOTSUP);
+ }
+ break;
+ }
+
+ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
{
zfs_cmd_t *zc;
- nvpair_t *prop;
+ nvpair_t *pair, *next_pair;
+ nvlist_t *errors;
+ int err, rv = 0;
if (props == NULL)
- return;
+ return (0);
+
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
(void) strcpy(zc->zc_name, dataset);
- for (prop = nvlist_next_nvpair(props, NULL); prop;
- prop = nvlist_next_nvpair(props, prop)) {
- if (newprops != NULL &&
- nvlist_exists(newprops, nvpair_name(prop)))
- continue;
- (void) strcpy(zc->zc_value, nvpair_name(prop));
- if (zfs_secpolicy_inherit(zc, CRED()) == 0)
- (void) zfs_ioc_inherit_prop(zc);
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ (void) strcpy(zc->zc_value, nvpair_name(pair));
+ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+ (err = zfs_secpolicy_inherit(zc, CRED())) != 0) {
+ VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+ VERIFY(nvlist_add_int32(errors,
+ zc->zc_value, err) == 0);
+ }
+ pair = next_pair;
}
kmem_free(zc, sizeof (zfs_cmd_t));
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
+}
+
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+ if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+ /* dsl_prop_get_all_impl() format */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p1) == 0);
+ }
+
+ if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p2) == 0);
+ }
+
+ if (nvpair_type(p1) != nvpair_type(p2))
+ return (B_FALSE);
+
+ if (nvpair_type(p1) == DATA_TYPE_STRING) {
+ char *valstr1, *valstr2;
+
+ VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
+ return (strcmp(valstr1, valstr2) == 0);
+ } else {
+ uint64_t intval1, intval2;
+
+ VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+ VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+ return (intval1 == intval2);
+ }
+}
+
+/*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+ nvpair_t *pair, *next_pair;
+
+ if (origprops == NULL)
+ return; /* all props need to be received */
+
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ const char *propname = nvpair_name(pair);
+ nvpair_t *match;
+
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ if ((nvlist_lookup_nvpair(origprops, propname,
+ &match) != 0) || !propval_equals(pair, match))
+ goto next; /* need to set received value */
+
+ /* don't clear the existing received value */
+ (void) nvlist_remove_nvpair(origprops, match);
+ /* don't bother receiving the property */
+ (void) nvlist_remove_nvpair(props, pair);
+next:
+ pair = next_pair;
+ }
}
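
A hypothetical before/after, using simplified flat uint64 pairs rather than the nvlist-wrapped form dsl_prop_get_all_impl() can produce:

	nvlist_t *props, *origprops;

	(void) nvlist_alloc(&props, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(props, "copies", 2);		/* differs */
	(void) nvlist_add_uint64(props, "compression", 2);	/* unchanged */

	(void) nvlist_alloc(&origprops, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(origprops, "compression", 2);
	(void) nvlist_add_uint64(origprops, "quota", 10ULL << 30);

	/*
	 * props_reduce(props, origprops) would leave:
	 *   props     = { copies=2 }	(compression won't change, dropped)
	 *   origprops = { quota=10G }	(nothing to clear or restore)
	 */
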
+#ifdef DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
/*
* inputs:
* zc_name name of containing filesystem
@@ -2731,9 +3541,14 @@ clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
* zc_cookie file descriptor to recv from
* zc_begin_record the BEGIN record of the stream (not byteswapped)
* zc_guid force flag
+ * zc_cleanup_fd cleanup-on-exit file descriptor
+ * zc_action_handle handle for this guid/ds mapping (or zero on first call)
*
* outputs:
* zc_cookie number of bytes read
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ * zc_obj zprop_errflags_t
+ * zc_action_handle handle for this guid/ds mapping
*/
static int
zfs_ioc_recv(zfs_cmd_t *zc)
@@ -2741,15 +3556,18 @@ zfs_ioc_recv(zfs_cmd_t *zc)
file_t *fp;
objset_t *os;
dmu_recv_cookie_t drc;
- zfsvfs_t *zfsvfs = NULL;
boolean_t force = (boolean_t)zc->zc_guid;
- int error, fd;
+ int fd;
+ int error = 0;
+ int props_error = 0;
+ nvlist_t *errors;
offset_t off;
- nvlist_t *props = NULL;
- nvlist_t *origprops = NULL;
+ nvlist_t *props = NULL; /* sent properties */
+ nvlist_t *origprops = NULL; /* existing properties */
objset_t *origin = NULL;
char *tosnap;
char tofs[ZFS_MAXNAMELEN];
+ boolean_t first_recvd_props = B_FALSE;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '@') == NULL ||
@@ -2758,123 +3576,204 @@ zfs_ioc_recv(zfs_cmd_t *zc)
(void) strcpy(tofs, zc->zc_value);
tosnap = strchr(tofs, '@');
- *tosnap = '\0';
- tosnap++;
+ *tosnap++ = '\0';
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)) != 0)
+ zc->zc_iflags, &props)) != 0)
return (error);
fd = zc->zc_cookie;
- fp = getf(fd, 0);
+ fp = getf(fd);
if (fp == NULL) {
nvlist_free(props);
return (EBADF);
}
- if (getzfsvfs(tofs, &zfsvfs) == 0) {
- if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
- VFS_RELE(zfsvfs->z_vfs);
- zfsvfs = NULL;
- error = EBUSY;
- goto out;
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
+ !dsl_prop_get_hasrecvd(os)) {
+ first_recvd_props = B_TRUE;
}
+
/*
- * If new properties are supplied, they are to completely
- * replace the existing ones, so stash away the existing ones.
- */
- if (props)
- (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
- } else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- /*
- * Get the props even if there was no zfsvfs (zvol or
- * unmounted zpl).
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties, so stash
+ * away the existing ones.
*/
- (void) dsl_prop_get_all(os, &origprops, TRUE);
+ if (dsl_prop_get_received(os, &origprops) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */
+ if (!first_recvd_props)
+ props_reduce(props, origprops);
+ if (zfs_check_clearable(tofs, origprops,
+ &errlist) != 0)
+ (void) nvlist_merge(errors, errlist, 0);
+ nvlist_free(errlist);
+ }
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
if (zc->zc_string[0]) {
- error = dmu_objset_open(zc->zc_string, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &origin);
+ error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
if (error)
goto out;
}
- error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
- force, origin, zfsvfs != NULL, &drc);
+ error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
+ &zc->zc_begin_record, force, origin, &drc);
if (origin)
- dmu_objset_close(origin);
+ dmu_objset_rele(origin, FTAG);
if (error)
goto out;
/*
- * Reset properties. We do this before we receive the stream
- * so that the properties are applied to the new data.
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
*/
if (props) {
- clear_props(tofs, origprops, props);
+ nvlist_t *errlist;
+
+ if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
+ if (drc.drc_newfs) {
+ if (spa_version(os->os_spa) >=
+ SPA_VERSION_RECVD_PROPS)
+ first_recvd_props = B_TRUE;
+ } else if (origprops != NULL) {
+ if (clear_received_props(os, tofs, origprops,
+ first_recvd_props ? NULL : props) != 0)
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ } else {
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ }
+ dsl_prop_set_hasrecvd(os);
+ } else if (!drc.drc_newfs) {
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ }
+
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ props, &errlist);
+ (void) nvlist_merge(errors, errlist, 0);
+ nvlist_free(errlist);
+ }
+
+ if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) {
/*
- * XXX - Note, this is all-or-nothing; should be best-effort.
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
*/
- (void) zfs_set_prop_nvlist(tofs, props);
+ props_error = EINVAL;
}
off = fp->f_offset;
- error = dmu_recv_stream(&drc, fp, &off);
+ error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
+ &zc->zc_action_handle);
- if (error == 0 && zfsvfs) {
- char *osname;
- int mode;
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
- /* online recv */
- osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- error = zfs_suspend_fs(zfsvfs, osname, &mode);
- if (error == 0) {
- int resume_err;
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ int end_err;
- error = dmu_recv_end(&drc);
- resume_err = zfs_resume_fs(zfsvfs, osname, mode);
- error = error ? error : resume_err;
+ error = zfs_suspend_fs(zfsvfs);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc);
+ if (error == 0)
+ error = zfs_resume_fs(zfsvfs, tofs);
+ error = error ? error : end_err;
+ VFS_RELE(zfsvfs->z_vfs);
} else {
- dmu_recv_abort_cleanup(&drc);
+ error = dmu_recv_end(&drc);
}
- kmem_free(osname, MAXNAMELEN);
- } else if (error == 0) {
- error = dmu_recv_end(&drc);
}
zc->zc_cookie = off - fp->f_offset;
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
+#ifdef DEBUG
+ if (zfs_ioc_recv_inject_err) {
+ zfs_ioc_recv_inject_err = B_FALSE;
+ error = 1;
+ }
+#endif
/*
* On error, restore the original props.
*/
if (error && props) {
- clear_props(tofs, props, NULL);
- (void) zfs_set_prop_nvlist(tofs, origprops);
+ if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if (clear_received_props(os, tofs, props, NULL) != 0) {
+ /*
+ * We failed to clear the received properties.
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(os);
+ }
+ dmu_objset_rele(os, FTAG);
+ } else if (!drc.drc_newfs) {
+ /* We failed to clear the received properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ if (origprops == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origprops != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origprops, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
}
out:
- if (zfsvfs) {
- mutex_exit(&zfsvfs->z_online_recv_lock);
- VFS_RELE(zfsvfs->z_vfs);
- }
nvlist_free(props);
nvlist_free(origprops);
- releasef(fp);
+ nvlist_free(errors);
+ releasef(fd);
+
+ if (error == 0)
+ error = props_error;
+
return (error);
}
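
Since zc_obj carries zprop_errflags_t bits out of this ioctl, user space can tell a failed stream apart from a stream that landed while property cleanup failed. A hedged sketch of the consumer side (command setup elided; ZFS_IOC_RECV and /dev/zfs as conventionally used):

	zfs_cmd_t zc = { 0 };
	/* ... fill in zc_name, zc_value, zc_cookie (stream fd), etc. ... */

	err = ioctl(devfd, ZFS_IOC_RECV, &zc);

	if (zc.zc_obj & ZPROP_ERR_NOCLEAR)
		(void) fprintf(stderr,
		    "warning: existing received properties not cleared\n");
	if (zc.zc_obj & ZPROP_ERR_NORESTORE)
		(void) fprintf(stderr,
		    "warning: original properties not restored\n");
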
/*
* inputs:
* zc_name name of snapshot to send
- * zc_value short name of incremental fromsnap (may be empty)
* zc_cookie file descriptor to send stream to
- * zc_obj fromorigin flag (mutually exclusive with zc_value)
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
*
* outputs: none
*/
@@ -2886,36 +3785,55 @@ zfs_ioc_send(zfs_cmd_t *zc)
file_t *fp;
int error;
offset_t off;
+ dsl_dataset_t *ds;
+ dsl_dataset_t *dsfrom = NULL;
+ spa_t *spa;
+ dsl_pool_t *dp;
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &tosnap);
+ error = spa_open(zc->zc_name, &spa, FTAG);
if (error)
return (error);
- if (zc->zc_value[0] != '\0') {
- char *buf;
- char *cp;
-
- buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strncpy(buf, zc->zc_name, MAXPATHLEN);
- cp = strchr(buf, '@');
- if (cp)
- *(cp+1) = 0;
- (void) strlcat(buf, zc->zc_value, MAXPATHLEN);
- error = dmu_objset_open(buf, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &fromsnap);
- kmem_free(buf, MAXPATHLEN);
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ error = dmu_objset_from_ds(ds, &tosnap);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
if (error) {
- dmu_objset_close(tosnap);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
+ error = dmu_objset_from_ds(dsfrom, &fromsnap);
+ if (error) {
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ } else {
+ spa_close(spa, FTAG);
}
- fp = getf(zc->zc_cookie, 1);
+ fp = getf(zc->zc_cookie);
if (fp == NULL) {
- dmu_objset_close(tosnap);
- if (fromsnap)
- dmu_objset_close(fromsnap);
+ dsl_dataset_rele(ds, FTAG);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
return (EBADF);
}
@@ -2924,10 +3842,10 @@ zfs_ioc_send(zfs_cmd_t *zc)
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
- releasef(fp);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- dmu_objset_close(tosnap);
+ releasef(zc->zc_cookie);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
@@ -3003,16 +3921,41 @@ zfs_ioc_clear(zfs_cmd_t *zc)
mutex_exit(&spa_namespace_lock);
return (EIO);
}
- if (spa->spa_log_state == SPA_LOG_MISSING) {
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
/* we need to let spa_open/spa_load clear the chains */
- spa->spa_log_state = SPA_LOG_CLEAR;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
}
+ spa->spa_last_open_failed = 0;
mutex_exit(&spa_namespace_lock);
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ } else {
+ nvlist_t *policy;
+ nvlist_t *config = NULL;
+
+ if (zc->zc_nvlist_src == 0)
+ return (EINVAL);
+
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+ error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+ policy, &config);
+ if (config != NULL) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ nvlist_free(config);
+ }
+ nvlist_free(policy);
+ }
+ }
+
+ if (error)
return (error);
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if (zc->zc_guid == 0) {
vd = NULL;
@@ -3045,7 +3988,8 @@ zfs_ioc_clear(zfs_cmd_t *zc)
* zc_name name of filesystem
* zc_value name of origin snapshot
*
- * outputs: none
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
*/
static int
zfs_ioc_promote(zfs_cmd_t *zc)
@@ -3061,7 +4005,7 @@ zfs_ioc_promote(zfs_cmd_t *zc)
*cp = '\0';
(void) dmu_objset_find(zc->zc_value,
zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
- return (dsl_dataset_promote(zc->zc_name));
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
}
/*
@@ -3085,7 +4029,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (EINVAL);
- error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
@@ -3111,13 +4055,15 @@ static int
zfs_ioc_userspace_many(zfs_cmd_t *zc)
{
zfsvfs_t *zfsvfs;
- int error;
+ int bufsize = zc->zc_nvlist_dst_size;
- error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ if (bufsize <= 0)
+ return (ENOMEM);
+
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
- int bufsize = zc->zc_nvlist_dst_size;
void *buf = kmem_alloc(bufsize, KM_SLEEP);
error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
@@ -3145,34 +4091,31 @@ static int
zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
{
objset_t *os;
- int error;
+ int error = 0;
zfsvfs_t *zfsvfs;
if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
- if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
/*
* If userused is not enabled, it may be because the
* objset needs to be closed & reopened (to grow the
* objset_phys_t). Suspend/resume the fs will do that.
*/
- int mode;
- error = zfs_suspend_fs(zfsvfs, NULL, &mode);
- if (error == 0) {
- error = zfs_resume_fs(zfsvfs,
- zc->zc_name, mode);
- }
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0)
+ error = zfs_resume_fs(zfsvfs, zc->zc_name);
}
if (error == 0)
error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
VFS_RELE(zfsvfs->z_vfs);
} else {
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER, &os);
+ /* XXX kind of reading contents without owning */
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error)
return (error);
error = dmu_objset_userspace_upgrade(os);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
return (error);
@@ -3219,7 +4162,7 @@ zfs_init_sharefs()
}
return (0);
}
-#endif /* sun */
+#endif /* sun */
static int
zfs_ioc_share(zfs_cmd_t *zc)
@@ -3314,15 +4257,123 @@ zfs_ioc_share(zfs_cmd_t *zc)
zc->zc_share.z_sharemax);
return (error);
-#else /* sun */
+
+#else /* !sun */
return (ENOSYS);
-#endif /* sun */
+#endif /* !sun */
}
ace_t full_access[] = {
{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
};
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
+ os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ int error;
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+
+ if (strlen(snap_name) >= MAXNAMELEN) {
+ strfree(snap_name);
+ return (E2BIG);
+ }
+
+ error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name,
+ NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd);
+ if (error != 0) {
+ strfree(snap_name);
+ return (error);
+ }
+
+ (void) strcpy(zc->zc_value, snap_name);
+ strfree(snap_name);
+ return (0);
+}
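
The generated name is the caller's prefix joined to a zero-padded 64-bit hex tick count; a trivial demonstration of the format string:

	#include <stdio.h>

	int
	main(void)
	{
		/* stand-in for ddi_get_lbolt64() */
		unsigned long long ticks = 0x1234abcdULL;

		/* prints "zfs-diff-000000001234abcd" */
		printf("%s-%016llx\n", "zfs-diff", ticks);
		return (0);
	}
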
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ objset_t *fromsnap;
+ objset_t *tosnap;
+ file_t *fp;
+ offset_t off;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
+ if (error)
+ return (error);
+
+ error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap);
+ if (error) {
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+ }
+
+ fp = getf(zc->zc_cookie);
+ if (fp == NULL) {
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (EBADF);
+ }
+
+ off = fp->f_offset;
+
+ error = dmu_diff(tosnap, fromsnap, fp, &off);
+
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+}
+
#ifdef sun
/*
* Remove all ACL files in shares dir
@@ -3368,7 +4419,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
/* Now make sure mntpnt and dataset are ZFS */
- if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
(strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
zc->zc_name) != 0)) {
VN_RELE(vp);
@@ -3377,7 +4428,6 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
dzp = VTOZ(vp);
zfsvfs = dzp->z_zfsvfs;
-
ZFS_ENTER(zfsvfs);
/*
@@ -3440,7 +4490,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
case ZFS_SMB_ACL_RENAME:
if ((error = get_nvlist(zc->zc_nvlist_src,
- zc->zc_nvlist_src_size, &nvlist)) != 0) {
+ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
ZFS_EXIT(zfsvfs);
return (error);
@@ -3451,6 +4501,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
VN_RELE(vp);
VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
+ nvlist_free(nvlist);
return (error);
}
error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
@@ -3479,6 +4530,127 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
}
/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this hold
+ * zc_cookie recursive flag
+ * zc_temphold set if hold is temporary
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ * zc_sendobj if non-zero, the objid for zc_name@zc_value
+ * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_hold(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+ minor_t minor = 0;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (zc->zc_sendobj == 0) {
+ return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive, zc->zc_temphold,
+ zc->zc_cleanup_fd));
+ }
+
+ if (recursive)
+ return (EINVAL);
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ /*
+ * Until we have a hold on this snapshot, it's possible that
+ * zc_sendobj could've been destroyed and reused as part
+ * of a later txg. Make sure we're looking at the right object.
+ */
+ if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOENT);
+ }
+
+ if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ }
+
+ error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
+ zc->zc_temphold);
+ if (minor != 0) {
+ if (error == 0) {
+ dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
+ minor);
+ }
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of dataset from which we're releasing a user hold
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this hold
+ * zc_cookie recursive flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_release(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of snapshot holds
+ */
+static int
+zfs_ioc_get_holds(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
+/*
* pool create, destroy, and export don't log the history as part of
* zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
* do the logging of those commands.
@@ -3514,7 +4686,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_FALSE },
{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
@@ -3534,6 +4706,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_TRUE },
{ zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
B_TRUE },
+ { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_TRUE },
{ zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_TRUE },
{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
@@ -3543,10 +4717,6 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_TRUE },
{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
- { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
- B_FALSE },
{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
B_TRUE},
@@ -3566,14 +4736,14 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
- B_TRUE },
+ { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
+ B_TRUE, B_TRUE },
{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE,
B_FALSE },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ B_TRUE },
{ zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
@@ -3582,13 +4752,9 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_TRUE },
{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
- B_FALSE },
{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
- { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
B_FALSE },
{ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
@@ -3597,15 +4763,30 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
DATASET_NAME, B_FALSE, B_FALSE },
{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
DATASET_NAME, B_FALSE, B_TRUE },
- { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_TRUE }
+ { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, B_FALSE },
+ { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME,
+ B_FALSE, B_FALSE },
+ { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
+ { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }
};
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type)
{
spa_t *spa;
- char pool[ZFS_MAXNAMELEN];
int error;
ASSERT(type == POOL_NAME || type == DATASET_NAME);
@@ -3619,27 +4800,157 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type)
return (error);
}
+/*
+ * Find a free minor number.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
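
A standalone restatement of the scan as a bounded loop over an in-use table (MAX_MINOR and the table are illustrative stand-ins for ZFSDEV_MAX_MINOR and the soft-state lookups; minor 0 stays reserved for the control node):

	#include <stdbool.h>

	#define MAX_MINOR 255			/* stand-in for ZFSDEV_MAX_MINOR */

	static bool inuse[MAX_MINOR + 1];	/* stand-in for soft-state table */
	static unsigned last_minor;

	/* Returns a free minor in [1, MAX_MINOR], or 0 if all are in use. */
	static unsigned
	minor_alloc(void)
	{
		for (unsigned i = 1; i <= MAX_MINOR; i++) {
			/* start just past the last allocation, wrapping */
			unsigned m = (last_minor + i - 1) % MAX_MINOR + 1;

			if (!inuse[m]) {
				inuse[m] = true;
				last_minor = m;
				return (m);
			}
		}
		return (0);	/* every slot taken */
	}
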
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ minor_t minor;
+ zfs_soft_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (ENXIO);
+
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+ return (EAGAIN);
+
+ devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_CTLDEV;
+ zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
+
+ return (0);
+}
+
+static void
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ zfs_onexit_destroy(zo);
+ ddi_soft_state_free(zfsdev_state, minor);
+}
+
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+ zfs_soft_state_t *zp;
+
+ zp = ddi_get_soft_state(zfsdev_state, minor);
+ if (zp == NULL || zp->zss_type != which)
+ return (NULL);
+
+ return (zp->zss_data);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error = 0;
+
+#ifdef sun
+ if (getminor(*devp) != 0)
+ return (zvol_open(devp, flag, otyp, cr));
+#endif
+
+ /* This is the control device. Allocate a new minor if requested. */
+ if (flag & FEXCL) {
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+ }
+
+ return (error);
+}
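For context: this FEXCL path is exercised from user space by opening /dev/zfs with O_EXCL, which triggers the clone-open described in the zfs_onexit.c comment later in this diff. A minimal userland sketch, assuming read/write mode and an illustrative helper name:

	#include <fcntl.h>
	#include <unistd.h>

	/*
	 * Obtain a private cleanup fd; O_EXCL maps to FEXCL in
	 * zfsdev_open(), which allocates a fresh minor and attaches
	 * per-fd state.
	 */
	int
	get_cleanup_fd(void)
	{
		int fd = open("/dev/zfs", O_RDWR | O_EXCL);

		/*
		 * The fd is then passed to ioctls that accumulate kernel
		 * state; close(fd) or process exit fires zfsdev_close()
		 * and any registered cleanup callbacks.
		 */
		return (fd);
	}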
+
+static void
+zfsdev_close(void *data)
+{
+ zfs_onexit_t *zo;
+ minor_t minor = (minor_t)(uintptr_t)data;
+
+ if (minor == 0)
+ return;
+
+ mutex_enter(&zfsdev_state_lock);
+ zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (zo == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return;
+ }
+ zfs_ctldev_destroy(zo, minor);
+ mutex_exit(&zfsdev_state_lock);
+}
+
static int
zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
struct thread *td)
{
- zfs_cmd_t *zc = (void *)addr;
+ zfs_cmd_t *zc;
uint_t vec;
- int error;
+ int cflag, error, len;
+
+ cflag = ZFS_CMD_COMPAT_NONE;
+ len = IOCPARM_LEN(cmd);
/*
* Check if we have sufficient kernel memory allocated
 * for the zfs_cmd_t request. Bail out if not, so we
 * will not access an undefined memory region.
*/
- if (IOCPARM_LEN(cmd) < sizeof(zfs_cmd_t))
- return (EINVAL);
+ if (len < sizeof(zfs_cmd_t))
+ if (len == sizeof(zfs_cmd_v15_t)) {
+ cflag = ZFS_CMD_COMPAT_V15;
+ vec = zfs_ioctl_v15_to_v28[ZFS_IOC(cmd)];
+ } else
+ return (EINVAL);
+ else
+ vec = ZFS_IOC(cmd);
- vec = ZFS_IOC(cmd);
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ if (vec == ZFS_IOC_COMPAT_PASS)
+ return (0);
+ else if (vec == ZFS_IOC_COMPAT_FAIL)
+ return (ENOTSUP);
+ }
if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
return (EINVAL);
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+ bzero(zc, sizeof(zfs_cmd_t));
+ zfs_cmd_compat_get(zc, addr, cflag);
+ zfs_ioctl_compat_pre(zc, &vec, cflag);
+ } else {
+ zc = (void *)addr;
+ }
+
error = zfs_ioc_vec[vec].zvec_secpolicy(zc, td->td_ucred);
/*
@@ -3648,6 +4959,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
*/
if (error == 0) {
zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_iflags = flag & FKIOCTL;
switch (zfs_ioc_vec[vec].zvec_namecheck) {
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
@@ -3678,9 +4990,68 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
zfs_log_history(zc);
}
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ zfs_ioctl_compat_post(zc, ZFS_IOC(cmd), cflag);
+ zfs_cmd_compat_put(zc, addr, cflag);
+ kmem_free(zc, sizeof(zfs_cmd_t));
+ }
+
return (error);
}
+#ifdef sun
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ zfs_dip = dip;
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (DDI_FAILURE);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ zfs_dip = NULL;
+
+ ddi_prop_remove_all(dip);
+ ddi_remove_minor_node(dip, NULL);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = zfs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ return (DDI_SUCCESS);
+ }
+
+ return (DDI_FAILURE);
+}
+#endif /* sun */
+
/*
* OK, so this is a little weird.
*
@@ -3690,8 +5061,60 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
* /dev/zfs has basically nothing to do except serve up ioctls,
* so most of the standard driver entry points are in zvol.c.
*/
+#ifdef sun
+static struct cb_ops zfs_cb_ops = {
+ zfsdev_open, /* open */
+ zfsdev_close, /* close */
+ zvol_strategy, /* strategy */
+ nodev, /* print */
+ zvol_dump, /* dump */
+ zvol_read, /* read */
+ zvol_write, /* write */
+ zfsdev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
+ CB_REV, /* version */
+ nodev, /* async read */
+ nodev, /* async write */
+};
+
+static struct dev_ops zfs_dev_ops = {
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ zfs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ zfs_attach, /* attach */
+ zfs_detach, /* detach */
+ nodev, /* reset */
+ &zfs_cb_ops, /* driver operations */
+ NULL, /* no bus operations */
+ NULL, /* power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv zfs_modldrv = {
+ &mod_driverops,
+ "ZFS storage pool",
+ &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&zfs_modlfs,
+ (void *)&zfs_modldrv,
+ NULL
+};
+#endif /* sun */
+
static struct cdevsw zfs_cdevsw = {
.d_version = D_VERSION,
+ .d_open = zfsdev_open,
.d_ioctl = zfsdev_ioctl,
.d_name = ZFS_DEV_NAME
};
@@ -3716,6 +5139,69 @@ struct proc *zfsproc;
uint_t zfs_fsyncer_key;
extern uint_t rrw_tsd_key;
+#ifdef sun
+int
+_init(void)
+{
+ int error;
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ return (error);
+ }
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, NULL);
+
+ error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+ ASSERT(error == 0);
+ mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ if (zfs_nfsshare_inited)
+ (void) ddi_modclose(nfs_mod);
+ if (zfs_smbshare_inited)
+ (void) ddi_modclose(smbsrv_mod);
+ if (zfs_nfsshare_inited || zfs_smbshare_inited)
+ (void) ddi_modclose(sharefs_mod);
+
+ tsd_destroy(&zfs_fsyncer_key);
+ ldi_ident_release(zfs_li);
+ zfs_li = NULL;
+ mutex_destroy(&zfs_share_lock);
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+#endif /* sun */
+
static int
zfs_modevent(module_t mod, int type, void *unused __unused)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
index 310508875347..29378d8e71fc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -44,14 +43,6 @@
#include <sys/zfs_fuid.h>
#include <sys/dsl_dataset.h>
-#define ZFS_HANDLE_REPLAY(zilog, tx) \
- if (zilog->zl_replay) { \
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
- zilog->zl_replaying_seq; \
- return; \
- }
-
/*
* These zfs_log_* functions must be called within a dmu tx, in one
* of 2 contexts depending on zilog->z_replay:
@@ -180,6 +171,15 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
}
static void *
@@ -241,7 +241,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
zfs_fuid_info_t *fuidp, vattr_t *vap)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
lr_acl_create_t *lracl;
size_t aclsize;
@@ -253,11 +252,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
size_t namesize = strlen(name) + 1;
size_t fuidsz = 0;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If we have FUIDs present then add in space for
* domains and ACE fuid's if any.
@@ -288,21 +285,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
- lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(zp->z_uid)) {
+ lr->lr_uid = (uint64_t)zp->z_uid;
} else {
lr->lr_uid = fuidp->z_fuid_owner;
}
- if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
- lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+ if (!IS_EPHEMERAL(zp->z_gid)) {
+ lr->lr_gid = (uint64_t)zp->z_gid;
} else {
lr->lr_gid = fuidp->z_fuid_group;
}
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- lr->lr_rdev = zp->z_phys->zp_rdev;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
/*
* Fill in xvattr info if any
@@ -341,9 +342,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
bcopy(name, end, namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -351,25 +350,23 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name)
+ znode_t *dzp, char *name, uint64_t foid)
{
itx_t *itx;
- uint64_t seq;
lr_remove_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_remove_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
+ itx->itx_oid = foid;
+
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -380,24 +377,19 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name)
{
itx_t *itx;
- uint64_t seq;
lr_link_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_link_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_link_obj = zp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -408,32 +400,28 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name, char *link)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
size_t namesize = strlen(name) + 1;
size_t linksize = strlen(link) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_uid = zp->z_uid;
+ lr->lr_gid = zp->z_gid;
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
bcopy(name, (char *)(lr + 1), namesize);
bcopy(link, (char *)(lr + 1) + namesize, linksize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -444,27 +432,22 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
itx_t *itx;
- uint64_t seq;
lr_rename_t *lr;
size_t snamesize = strlen(sname) + 1;
size_t dnamesize = strlen(dname) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id;
lr->lr_tdoid = tdzp->z_id;
bcopy(sname, (char *)(lr + 1), snamesize);
bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
- seq = zil_itx_assign(zilog, itx, tx);
- sdzp->z_last_itx = seq;
- tdzp->z_last_itx = seq;
- szp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -472,9 +455,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
ssize_t zfs_immediate_write_sz = 32768;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
- sizeof (lr_write_t))
-
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -482,37 +462,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t write_state;
boolean_t slogging;
uintptr_t fsync_cnt;
+ ssize_t immediate_write_sz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ ? 0 : zfs_immediate_write_sz;
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * In this mode, if we need to commit the write later, then the block
- * is immediately written into the file system (using dmu_sync),
- * and a pointer to the block is put into the log record.
- * When the txg commits the block is linked in.
- * This saves additionally writing the data into the log record.
- * There are a few requirements for this to occur:
- * - write is greater than zfs_immediate_write_sz
- * - not using slogs (as slogs are assumed to always be faster
- * than writing into the main pool)
- * - the write occupies only one block
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
- slogging = spa_has_slogs(zilog->zl_spa);
- if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
+ slogging = spa_has_slogs(zilog->zl_spa) &&
+ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
@@ -541,8 +501,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
+ zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
@@ -559,13 +518,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx->itx_private = zp->z_zfsvfs;
- if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
- (ioflag & (FSYNC | FDSYNC)))
- itx->itx_sync = B_TRUE;
- else
+ if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
itx->itx_sync = B_FALSE;
- zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+ zil_itx_assign(zilog, itx, tx);
off += len;
resid -= len;
@@ -580,14 +537,11 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t off, uint64_t len)
{
itx_t *itx;
- uint64_t seq;
lr_truncate_t *lr;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_truncate_t *)&itx->itx_lr;
lr->lr_foid = zp->z_id;
@@ -595,8 +549,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
lr->lr_length = len;
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -607,18 +560,14 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_setattr_t *lr;
xvattr_t *xvap = (xvattr_t *)vap;
size_t recsize = sizeof (lr_setattr_t);
void *start;
-
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If XVATTR set, then log record size needs to allow
* for lr_attr_t + xvattr mask, mapsize and create time
@@ -662,8 +611,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
(void) zfs_log_fuid_domains(fuidp, start);
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -674,7 +622,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_acl_v0_t *lrv0;
lr_acl_t *lr;
int txtype;
@@ -682,11 +629,9 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
size_t txsize;
size_t aclbytes = vsecp->vsa_aclentsz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
TX_ACL_V0 : TX_ACL;
@@ -732,6 +677,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
}
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
new file mode 100644
index 000000000000..ca0acfd3206d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
@@ -0,0 +1,252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
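To make the sequence above concrete, a minimal in-kernel consumer might look like the sketch below; my_state_t, my_cleanup(), and my_ioctl() are illustrative names only, while the three zfs_onexit_*() calls are the interfaces defined in this file:

	typedef struct my_state { int ms_refs; } my_state_t;	/* illustrative */

	static void
	my_cleanup(void *arg)
	{
		/* Tear down whatever state was accumulated across ioctls. */
		kmem_free(arg, sizeof (my_state_t));
	}

	static int
	my_ioctl(int cleanup_fd)
	{
		minor_t minor;
		uint64_t action_handle;
		my_state_t *sp;
		int error;

		/*
		 * Validate the fd and hold its file table entry first, so
		 * that zfs_onexit_add_cb() below cannot fail with EBADF.
		 */
		if ((error = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0)
			return (error);

		sp = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
		error = zfs_onexit_add_cb(minor, my_cleanup, sp,
		    &action_handle);
		zfs_onexit_fd_rele(cleanup_fd);
		return (error);
	}

On subsequent calls the same action_handle can be handed back to zfs_onexit_cb_data() to retrieve sp, as the dmu_recv_stream() example above describes.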
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+ zfs_onexit_t *zo;
+
+ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+ offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (*zo == NULL)
+ return (EBADF);
+
+ return (0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ file_t *fp, *tmpfp;
+ zfs_onexit_t *zo;
+ void *data;
+ int error;
+
+ fp = getf(fd);
+ if (fp == NULL)
+ return (EBADF);
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ if (error == 0)
+ *minorp = (minor_t)(uintptr_t)data;
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (error);
+ return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ releasef(fd);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
+
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+ zfs_onexit_action_node_t *match;
+ zfs_onexit_action_node_t *ap;
+ list_t *l;
+
+ ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+ match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+ l = &zo->zo_actions;
+ for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
+ if (match == ap)
+ break;
+ }
+ return (ap);
+}
+
+/*
+ * Delete the callback, triggering it first if 'fire' is set.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (fire)
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ } else {
+ mutex_exit(&zo->zo_lock);
+ error = ENOENT;
+ }
+
+ return (error);
+}
+
+/*
+ * Return the data associated with this callback. This allows consumers
+ * of the cleanup-on-exit interfaces to stash kernel data across system
+ * calls, knowing that it will be cleaned up if the calling process exits.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ *data = NULL;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL)
+ *data = ap->za_data;
+ else
+ error = ENOENT;
+ mutex_exit(&zo->zo_lock);
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
index c96524726f13..ebea17a3a32a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -132,6 +129,12 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
}
static int
@@ -516,7 +519,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
break;
case TX_MKXATTR:
- name = (char *)(lr + 1);
error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
break;
case TX_SYMLINK:
@@ -531,10 +533,8 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
VOP_UNLOCK(ZTOV(dzp), 0);
out:
- if (error == 0 && vp != NULL) {
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
- }
+ if (error == 0 && vp != NULL)
+ VN_URELE(vp);
VN_RELE(ZTOV(dzp));
@@ -588,6 +588,7 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
}
vput(vp);
VOP_UNLOCK(ZTOV(dzp), 0);
+
fail:
VN_RELE(ZTOV(dzp));
@@ -616,6 +617,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
if (lr->lr_common.lrc_txtype & TX_CI)
vflg |= FIGNORECASE;
+
cn.cn_nameptr = name;
cn.cn_cred = kcred;
cn.cn_thread = curthread;
@@ -710,7 +712,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
znode_t *zp;
int error;
ssize_t resid;
- uint64_t orig_eof, eod;
+ uint64_t eod, offset, length;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -725,15 +727,10 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
error = 0;
return (error);
}
- orig_eof = zp->z_phys->zp_size;
- eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
-
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(zfsvfs->z_log, lr);
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
- lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
/*
* This may be a write from a dmu_sync() for a whole block,
@@ -741,12 +738,29 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
* We can't just replay what was written for this TX_WRITE as
* a future TX_WRITE2 may extend the eof and the data for that
* write needs to be there. So we write the whole block and
- * reduce the eof.
+	 * reduce the eof. This needs to be done within the single dmu
+	 * transaction created within vn_rdwr -> zfs_write, so a possible
+	 * new end of file is passed through in zfsvfs->z_replay_eof.
*/
- if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
- zp->z_phys->zp_size = eod;
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
+ }
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+ UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
VN_RELE(ZTOV(zp));
+ zfsvfs->z_replay_eof = 0; /* safety */
return (error);
}
@@ -767,21 +781,34 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
+top:
end = lr->lr_offset + lr->lr_length;
- if (end > zp->z_phys->zp_size) {
- ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
- zp->z_phys->zp_size = end;
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
}
VN_RELE(ZTOV(zp));
@@ -792,9 +819,33 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
static int
zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
{
+#ifdef sun
+ znode_t *zp;
+ flock64_t fl;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&fl, sizeof (fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = 0;
+ fl.l_start = lr->lr_offset;
+ fl.l_len = lr->lr_length;
+ error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
+ lr->lr_offset, kcred, NULL);
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+#else /* !sun */
ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
return (EOPNOTSUPP);
+#endif /* !sun */
}
static int
@@ -816,16 +867,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log setattrs out of order, it's possible the
- * file has been removed. In this case just drop the setattr
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
@@ -874,16 +917,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
@@ -935,16 +970,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
}
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
#ifdef TODO
bzero(&vsa, sizeof (vsa));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
index 4de8d8a2dfed..7fd8f6020d08 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -112,7 +112,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
+ * SA data or a z_zfsvfs - so skip that processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
@@ -125,14 +125,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
+ new->r_off = zp->z_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ end_size = MAX(zp->z_size, new->r_off + len);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
new->r_off = 0;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
new file mode 100644
index 000000000000..d141e43d722a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * Order of attributes doesn't matter; a unique value
+ * will be assigned for each attribute that is file
+ * system specific.
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively. The file system
+ * could have other attributes stored in files, but they will be
+ * ignored. The SA framework will preserve them; this
+ * version of ZFS just won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {NULL, 0, 0, 0}
+};
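This table is consumed at mount time: the zfs_vfsops.c hunk later in this diff passes it to sa_setup(), which hands back per-attribute handles. A condensed sketch of that call, with illustrative variable names:

	uint64_t sa_obj = 0;		/* ZFS_SA_ATTRS master-node entry, or 0 pre-SA */
	sa_attr_type_t *attr_table;	/* receives one handle per ZPL_* attribute */
	int error;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &attr_table);
	/*
	 * On success the handles are used via sa_lookup()/sa_update() and
	 * the SA_ZPL_*() macros seen throughout this diff.
	 */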
+
+#ifdef _KERNEL
+
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+ error = uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY(dmu_set_bonus(db,
+ len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+ zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade,
+ * since the SA code can read both old and new znode formats
+ * with probably little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ sa_bulk_attr_t bulk[20];
+ int count = 0;
+ sa_bulk_attr_t sa_attrs[20] = { 0 };
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent;
+ uint64_t crtime[2], mtime[2], ctime[2];
+ zfs_acl_phys_t znode_acl;
+ char scanstamp[AV_SCANSTAMP_SZ];
+ boolean_t drop_lock = B_FALSE;
+
+ /*
+ * No upgrade if the ACL isn't cached,
+ * since we won't know which locks are held,
+ * and reading the ACL would require special "locked"
+ * interfaces that would be messy.
+ */
+ if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
+ return;
+
+ /*
+ * If the z_lock is held and we aren't the owner,
+ * then just return, since we don't want to deadlock
+ * trying to update the status of z_is_sa. This
+ * file can then be upgraded at a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (mutex_owner(&zp->z_lock) != curthread) {
+ if (mutex_tryenter(&zp->z_lock) == 0)
+ return;
+ else
+ drop_lock = B_TRUE;
+ }
+
+ /* First do a bulk query of the attributes that aren't cached */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ goto done;
+
+	/*
+	 * While the order here doesn't matter, it's best to try and
+	 * organize it in such a way as to pick up an already existing
+	 * layout number.
+	 */
+ count = 0;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
+
+ /* if scanstamp then add scanstamp */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+ return;
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 287de4ce7573..e9a956ce4bb4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -45,6 +46,7 @@
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
@@ -55,17 +57,19 @@
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
+#include <sys/sa.h>
+#include "zfs_comutil.h"
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
-int zfs_super_owner = 0;
+int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
"File system owner can perform privileged operation on his file systems");
-int zfs_debug_level = 0;
+int zfs_debug_level;
TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
"Debug level");
@@ -74,12 +78,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
"ZFS_ACL_VERSION");
-static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
- &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
-static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
- &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
"SPA_VERSION");
@@ -156,9 +154,8 @@ zfs_sync(vfs_t *vfsp, int waitfor)
}
if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
- else
- txg_wait_synced(dp, 0);
+ zil_commit(zfsvfs->z_log, 0);
+
ZFS_EXIT(zfsvfs);
} else {
/*
@@ -172,6 +169,60 @@ zfs_sync(vfs_t *vfsp, int waitfor)
return (0);
}
+#ifndef __FreeBSD__
+static int
+zfs_create_unique_device(dev_t *dev)
+{
+ major_t new_major;
+
+ do {
+ ASSERT3U(zfs_minor, <=, MAXMIN32);
+ minor_t start = zfs_minor;
+ do {
+ mutex_enter(&zfs_dev_mtx);
+ if (zfs_minor >= MAXMIN32) {
+ /*
+ * If we're still using the real major
+ * keep out of /dev/zfs and /dev/zvol minor
+ * number space. If we're using a getudev()'ed
+ * major number, we can use all of its minors.
+ */
+ if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
+ zfs_minor = ZFS_MIN_MINOR;
+ else
+ zfs_minor = 0;
+ } else {
+ zfs_minor++;
+ }
+ *dev = makedevice(zfs_major, zfs_minor);
+ mutex_exit(&zfs_dev_mtx);
+ } while (vfs_devismounted(*dev) && zfs_minor != start);
+ if (zfs_minor == start) {
+ /*
+ * We are using all ~262,000 minor numbers for the
+ * current major number. Create a new major number.
+ */
+ if ((new_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "zfs_mount: Can't get unique major "
+ "device number.");
+ return (-1);
+ }
+ mutex_enter(&zfs_dev_mtx);
+ zfs_major = new_major;
+ zfs_minor = 0;
+
+ mutex_exit(&zfs_dev_mtx);
+ } else {
+ break;
+ }
+ /* CONSTANTCONDITION */
+ } while (1);
+
+ return (0);
+}
+#endif /* !__FreeBSD__ */
+
static void
atime_changed_cb(void *arg, uint64_t newval)
{
@@ -313,14 +364,6 @@ vscan_changed_cb(void *arg, uint64_t newval)
}
static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_mode = newval;
-}
-
-static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
@@ -335,11 +378,11 @@ zfs_register_callbacks(vfs_t *vfsp)
objset_t *os = NULL;
zfsvfs_t *zfsvfs = NULL;
uint64_t nbmand;
- int readonly, do_readonly = FALSE;
- int setuid, do_setuid = FALSE;
- int exec, do_exec = FALSE;
- int xattr, do_xattr = FALSE;
- int atime, do_atime = FALSE;
+ int readonly, do_readonly = B_FALSE;
+ int setuid, do_setuid = B_FALSE;
+ int exec, do_exec = B_FALSE;
+ int xattr, do_xattr = B_FALSE;
+ int atime, do_atime = B_FALSE;
int error = 0;
ASSERT(vfsp);
@@ -360,7 +403,8 @@ zfs_register_callbacks(vfs_t *vfsp)
* of mount options, we stash away the current values and
* restore them after we register the callbacks.
*/
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
readonly = B_TRUE;
do_readonly = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
@@ -444,8 +488,6 @@ zfs_register_callbacks(vfs_t *vfsp)
error = error ? error : dsl_prop_register(ds,
"snapdir", snapdir_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
- "aclmode", acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
"aclinherit", acl_inherit_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
"vscan", vscan_changed_cb, zfsvfs);
@@ -483,7 +525,6 @@ unregister:
(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
zfsvfs);
(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
@@ -491,62 +532,53 @@ unregister:
}
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
- int64_t delta, dmu_tx_t *tx)
-{
- uint64_t used = 0;
- char buf[32];
- int err;
- uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
- if (delta == 0)
- return;
-
- (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
- err = zap_lookup(os, obj, buf, 8, 1, &used);
- ASSERT(err == 0 || err == ENOENT);
- /* no underflow/overflow */
- ASSERT(delta > 0 || used >= -delta);
- ASSERT(delta < 0 || used + delta > used);
- used += delta;
- if (used == 0)
- err = zap_remove(os, obj, buf, tx);
- else
- err = zap_update(os, obj, buf, 8, 1, &used, tx);
- ASSERT(err == 0);
-}
-
-static void
-zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
- void *oldbonus, void *newbonus,
- uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+ uint64_t *userp, uint64_t *groupp)
{
- znode_phys_t *oldznp = oldbonus;
- znode_phys_t *newznp = newbonus;
+ znode_phys_t *znp = data;
+ int error = 0;
- if (bonustype != DMU_OT_ZNODE)
- return;
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (ENOENT);
- /* We charge 512 for the dnode (if it's allocated). */
- if (oldznp->zp_gen != 0)
- oldused += DNODE_SIZE;
- if (newznp->zp_gen != 0)
- newused += DNODE_SIZE;
+ /*
+	 * If we have a NULL data pointer,
+	 * then assume the ids aren't changing and
+	 * return EEXIST to the dmu to let it know to
+	 * use the same ids.
+ */
+ if (data == NULL)
+ return (EEXIST);
- if (oldznp->zp_uid == newznp->zp_uid) {
- uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+ if (bonustype == DMU_OT_ZNODE) {
+ *userp = znp->zp_uid;
+ *groupp = znp->zp_gid;
} else {
- uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
- uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
- }
+ int hdrsize;
- if (oldznp->zp_gid == newznp->zp_gid) {
- uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
- } else {
- uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
- uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+ ASSERT(bonustype == DMU_OT_SA);
+ hdrsize = sa_hdrsize(data);
+
+ if (hdrsize != 0) {
+ *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_UID_OFFSET));
+ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_GID_OFFSET));
+ } else {
+ /*
+ * This should only happen for newly created
+ * files that haven't had the znode data filled
+ * in yet.
+ */
+ *userp = 0;
+ *groupp = 0;
+ }
}
+ return (error);
}
static void
@@ -733,7 +765,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
}
boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
char buf[32];
uint64_t used, quota, usedobj, quotaobj;
@@ -756,33 +788,48 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
return (used >= quota);
}
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+ uint64_t fuid;
+ uint64_t quotaobj;
+
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ fuid = isgroup ? zp->z_gid : zp->z_uid;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
int
-zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
objset_t *os;
zfsvfs_t *zfsvfs;
uint64_t zval;
int i, error;
+ uint64_t sa_obj;
- if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
- return (error);
- if (zval)
- mode |= DS_MODE_READONLY;
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
- if (error == EROFS) {
- mode |= DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
- }
- if (error)
+ /*
+ * We claim to always be readonly so we can open snapshots;
+ * other ZPL code will prevent us from writing to snapshots.
+ */
+ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+ if (error) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
+ }
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
*/
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
@@ -792,15 +839,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
if (error) {
goto out;
- } else if (zfsvfs->z_version > ZPL_VERSION) {
- (void) printf("Mismatched versions: File system "
- "is version %llu on-disk format, which is "
- "incompatible with this software version %lld!",
- (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+ } else if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+ "on a version %lld pool.\nPool must be upgraded to mount "
+ "this file system.", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
error = ENOTSUP;
goto out;
}
-
if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
goto out;
zfsvfs->z_norm = (int)zval;
@@ -822,6 +869,29 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error)
+ goto out;
+ } else {
+ /*
+ * Pre-SA version file systems should never touch
+ * either the attribute registration or layout objects.
+ */
+ sa_obj = 0;
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error)
+ goto out;
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
@@ -857,7 +927,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
goto out;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
@@ -867,12 +936,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- *zvp = zfsvfs;
+ *zfvp = zfsvfs;
return (0);
out:
- dmu_objset_close(os);
- *zvp = NULL;
+ dmu_objset_disown(os, zfsvfs);
+ *zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
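/*
 * Illustrative sketch (not part of the patch): dmu_objset_own() replaces
 * the old open/close pair, and the owner tag passed at own time must be
 * the same pointer handed back to dmu_objset_disown() on every exit path,
 * exactly as zfsvfs_create() above does with its zfsvfs pointer.
 */
static int
example_own_disown(const char *osname)
{
	objset_t *os;
	int tag;	/* any stable pointer works as the owner tag */
	int error;

	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, &tag, &os);
	if (error != 0)
		return (error);
	/* ... inspect the objset ... */
	dmu_objset_disown(os, &tag);
	return (0);
}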
@@ -889,15 +958,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
/*
* Set the objset user_ptr to track its zfsvfs.
*/
- mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
- if (zil_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- zfsvfs->z_log = NULL;
- }
/*
* If we are not mounting (i.e., online recv), then we don't
@@ -917,37 +982,42 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
else
zfs_unlinked_drain(zfsvfs);
- if (zfsvfs->z_log) {
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest
- * doesn't use readonly mounts, where
- * zfs_unlinked_drain() isn't called.) This is because
- * ziltest causes spa_sync() to think it's committed,
- * but actually it is not, so the intent log contains
- * many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated
- * in a yet later txg. This would write a "create
- * object N" record to the intent log. Normally, this
- * would be fine because the spa_sync() would have
- * written out the fact that object N is free, before
- * we could write the "create object N" intent log
- * record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
@@ -974,7 +1044,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
zfs_fuid_destroy(zfsvfs);
mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_online_recv_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrw_destroy(&zfsvfs->z_teardown_lock);
@@ -989,13 +1058,24 @@ static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
}
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
static int
@@ -1009,7 +1089,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
ASSERT(vfsp);
ASSERT(osname);
- error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+ error = zfsvfs_create(osname, &zfsvfs);
if (error)
return (error);
zfsvfs->z_vfs = vfsp;
@@ -1026,7 +1106,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
-
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
@@ -1053,6 +1132,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
}
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
@@ -1063,10 +1143,11 @@ zfs_domount(vfs_t *vfsp, char *osname)
goto out;
xattr_changed_cb(zfsvfs, pval);
zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
- mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
} else {
error = zfsvfs_setup(zfsvfs, B_TRUE);
}
@@ -1080,7 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
zfsctl_create(zfsvfs);
out:
if (error) {
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
} else {
atomic_add_32(&zfs_active_fs_count, 1);
@@ -1121,9 +1202,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
VERIFY(dsl_prop_unregister(ds, "aclinherit",
acl_inherit_changed_cb, zfsvfs) == 0);
@@ -1132,6 +1210,302 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
}
}
+#ifdef SECLABEL
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+ uint64_t num = 0;
+
+ while (*str) {
+ if (*str < '0' || *str > '9')
+ return (EINVAL);
+
+ num = num*10 + *str++ - '0';
+ }
+
+ *objnum = num;
+ return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number'. Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
+ */
+static int
+zfs_parse_bootfs(char *bpath, char *outpath)
+{
+ char *slashp;
+ uint64_t objnum;
+ int error;
+
+ if (*bpath == 0 || *bpath == '/')
+ return (EINVAL);
+
+ (void) strcpy(outpath, bpath);
+
+ slashp = strchr(bpath, '/');
+
+ /* if no '/', just return the pool name */
+ if (slashp == NULL) {
+ return (0);
+ }
+
+ /* if not a number, just return the root dataset name */
+ if (str_to_uint64(slashp+1, &objnum)) {
+ return (0);
+ }
+
+ *slashp = '\0';
+ error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
+ *slashp = '/';
+
+ return (error);
+}
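/*
 * Illustrative sketch (not part of the patch): a worked example of the
 * boot-property round trip described above.  The dataset names are made
 * up; only the "pool/objnum" shape matters.
 */
static void
example_parse_bootfs(void)
{
	char plain[] = "rpool";		/* no '/': pool name comes back */
	char byobj[] = "rpool/85";	/* objnum 85 resolves through */
					/* dsl_dsobj_to_dsname(), e.g. to */
					/* "rpool/ROOT/be1" */
	char out[MAXNAMELEN];

	(void) zfs_parse_bootfs(plain, out);
	(void) zfs_parse_bootfs(byobj, out);
}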
+
+/*
+ * zfs_check_global_label:
+ * Check that the hex label string is appropriate for the dataset
+ * being mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (EACCES);
+ return (rdonly ? 0 : EACCES);
+ }
+ return (EACCES);
+}
+
+/*
+ * zfs_mount_label_policy:
+ * Determine whether the mount is allowed according to the MAC check,
+ * by comparing (where appropriate) the label of the dataset against
+ * the label of the zone being mounted into. If the dataset has
+ * no label, create one.
+ *
+ * Returns:
+ * 0 : access allowed
+ * >0 : error code, such as EACCES
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+ int error, retv;
+ zone_t *mntzone = NULL;
+ ts_label_t *mnt_tsl;
+ bslabel_t *mnt_sl;
+ bslabel_t ds_sl;
+ char ds_hexsl[MAXNAMELEN];
+
+ retv = EACCES; /* assume the worst */
+
+ /*
+ * Start by getting the dataset label if it exists.
+ */
+ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (EACCES);
+
+ /*
+ * If labeling is NOT enabled, then disallow the mount of datasets
+ * which have a non-default label already. No other label checks
+ * are needed.
+ */
+ if (!is_system_labeled()) {
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ return (EACCES);
+ }
+
+ /*
+ * Get the label of the mountpoint. If mounting into the global
+ * zone (i.e. mountpoint is not within an active zone and the
+ * zoned property is off), the label must be default or
+ * admin_low/admin_high only; no other checks are needed.
+ */
+ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
+ if (mntzone->zone_id == GLOBAL_ZONEID) {
+ uint64_t zoned;
+
+ zone_rele(mntzone);
+
+ if (dsl_prop_get_integer(osname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (EACCES);
+ if (!zoned)
+ return (zfs_check_global_label(osname, ds_hexsl));
+ else
+ /*
+ * This is the case of a zone dataset being mounted
+ * initially, before the zone has been fully created;
+ * allow this mount into global zone.
+ */
+ return (0);
+ }
+
+ mnt_tsl = mntzone->zone_slabel;
+ ASSERT(mnt_tsl != NULL);
+ label_hold(mnt_tsl);
+ mnt_sl = label2bslabel(mnt_tsl);
+
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+ /*
+ * The dataset doesn't have a real label, so fabricate one.
+ */
+ char *str = NULL;
+
+ if (l_to_str_internal(mnt_sl, &str) == 0 &&
+ dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
+ retv = 0;
+ if (str != NULL)
+ kmem_free(str, strlen(str) + 1);
+ } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+ /*
+ * Now compare labels to complete the MAC check. If the
+ * labels are equal then allow access. If the mountpoint
+ * label dominates the dataset label, allow readonly access.
+ * Otherwise, access is denied.
+ */
+ if (blequal(mnt_sl, &ds_sl))
+ retv = 0;
+ else if (bldominates(mnt_sl, &ds_sl)) {
+ vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+ retv = 0;
+ }
+ }
+
+ label_rele(mnt_tsl);
+ zone_rele(mntzone);
+ return (retv);
+}
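/*
 * Illustrative sketch (not part of the patch): the label comparison at the
 * end of zfs_mount_label_policy() reduces to three outcomes, using the
 * same Trusted Extensions predicates shown above.
 */
static int
example_label_decision(bslabel_t *mnt_sl, bslabel_t *ds_sl, boolean_t *rdonly)
{
	*rdonly = B_FALSE;
	if (blequal(mnt_sl, ds_sl))
		return (0);		/* equal labels: read-write mount */
	if (bldominates(mnt_sl, ds_sl)) {
		*rdonly = B_TRUE;	/* dominating mountpoint: read-only */
		return (0);
	}
	return (EACCES);		/* disjoint labels: deny */
}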
+#endif /* SECLABEL */
+
+#ifdef OPENSOLARIS_MOUNTROOT
+static int
+zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
+{
+ int error = 0;
+ static int zfsrootdone = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ znode_t *zp = NULL;
+ vnode_t *vp = NULL;
+ char *zfs_bootfs;
+ char *zfs_devid;
+
+ ASSERT(vfsp);
+
+ /*
+ * The filesystem that we mount as root is defined in the
+ * boot property "zfs-bootfs" with a format of
+ * "poolname/root-dataset-objnum".
+ */
+ if (why == ROOT_INIT) {
+ if (zfsrootdone++)
+ return (EBUSY);
+ /*
+ * The process of doing a spa_load requires the clock
+ * to be set before we can (for example) do something
+ * better by looking at the timestamp on an uberblock,
+ * so just set it to -1.
+ */
+ clkset(-1);
+
+ if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
+ cmn_err(CE_NOTE, "spa_get_bootfs: cannot get "
+ "bootfs name");
+ return (EINVAL);
+ }
+ zfs_devid = spa_get_bootprop("diskdevid");
+ error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
+ if (zfs_devid)
+ spa_free_bootprop(zfs_devid);
+ if (error) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
+ error);
+ return (error);
+ }
+ if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
+ error);
+ return (error);
+ }
+
+ spa_free_bootprop(zfs_bootfs);
+
+ if (error = vfs_lock(vfsp))
+ return (error);
+
+ if (error = zfs_domount(vfsp, rootfs.bo_name)) {
+ cmn_err(CE_NOTE, "zfs_domount: error %d", error);
+ goto out;
+ }
+
+ zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
+ cmn_err(CE_NOTE, "zfs_zget: error %d", error);
+ goto out;
+ }
+
+ vp = ZTOV(zp);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VROOT;
+ mutex_exit(&vp->v_lock);
+ rootvp = vp;
+
+ /*
+ * Leave rootvp held. The root file system is never unmounted.
+ */
+
+ vfs_add((struct vnode *)0, vfsp,
+ (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
+out:
+ vfs_unlock(vfsp);
+ return (error);
+ } else if (why == ROOT_REMOUNT) {
+ readonly_changed_cb(vfsp->vfs_data, B_FALSE);
+ vfsp->vfs_flag |= VFS_REMOUNT;
+
+ /* refresh mount options */
+ zfs_unregister_callbacks(vfsp->vfs_data);
+ return (zfs_register_callbacks(vfsp));
+
+ } else if (why == ROOT_UNMOUNT) {
+ zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
+ (void) zfs_sync(vfsp, 0, 0);
+ return (0);
+ }
+
+ /*
+ * if "why" is equal to anything else other than ROOT_INIT,
+ * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
+ */
+ return (ENOTSUP);
+}
+#endif /* OPENSOLARIS_MOUNTROOT */
+
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
@@ -1203,6 +1577,12 @@ zfs_mount(vfs_t *vfsp)
goto out;
}
+#ifdef SECLABEL
+ error = zfs_mount_label_policy(vfsp, osname);
+ if (error)
+ goto out;
+#endif
+
vfsp->vfs_flag |= MNT_NFS4ACLS;
/*
@@ -1291,6 +1671,25 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp)
return (0);
}
+int
+zfs_vnode_lock(vnode_t *vp, int flags)
+{
+ int error;
+
+ ASSERT(vp != NULL);
+
+ /*
+ * Check whether the file system was forcibly unmounted in the meantime.
+ */
+ error = vn_lock(vp, flags);
+ if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
+ VOP_UNLOCK(vp, 0);
+ error = ENOENT;
+ }
+
+ return (error);
+}
+
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
@@ -1301,14 +1700,18 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
ZFS_ENTER_NOERROR(zfsvfs);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
ZFS_EXIT(zfsvfs);
if (error == 0) {
- *vpp = ZTOV(rootzp);
- error = vn_lock(*vpp, flags);
- (*vpp)->v_vflag |= VV_ROOT;
+ error = zfs_vnode_lock(*vpp, flags);
+ if (error == 0)
+ (*vpp)->v_vflag |= VV_ROOT;
}
+ if (error != 0)
+ *vpp = NULL;
return (error);
}
@@ -1371,7 +1774,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp))
- if (zp->z_dbuf) {
+ if (zp->z_sa_hdl) {
ASSERT(ZTOV(zp)->v_count >= 0);
zfs_znode_dmu_fini(zp);
}
@@ -1416,10 +1819,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
/*
* Evict cached data
*/
- if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
- }
+ if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
@@ -1440,6 +1843,7 @@ zfs_umount(vfs_t *vfsp, int fflag)
ZFS_DELEG_PERM_MOUNT, cr))
return (ret);
}
+
/*
* We purge the parent filesystem's vfsp as the parent filesystem
* and all of its snapshots have their vnode's v_vfsp set to the
@@ -1525,14 +1929,14 @@ zfs_umount(vfs_t *vfsp, int fflag)
/*
* Unset the objset user_ptr.
*/
- mutex_enter(&os->os->os_user_ptr_lock);
+ mutex_enter(&os->os_user_ptr_lock);
dmu_objset_set_user(os, NULL);
- mutex_exit(&os->os->os_user_ptr_lock);
+ mutex_exit(&os->os_user_ptr_lock);
/*
* Finally release the objset
*/
- dmu_objset_close(os);
+ dmu_objset_disown(os, zfsvfs);
}
/*
@@ -1572,13 +1976,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
VN_RELE(ZTOV(zp));
err = EINVAL;
}
+ if (err == 0)
+ *vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
+ if (err == 0)
+ err = zfs_vnode_lock(*vpp, flags);
if (err != 0)
*vpp = NULL;
- else {
- *vpp = ZTOV(zp);
- vn_lock(*vpp, flags);
- }
return (err);
}
@@ -1611,7 +2015,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
uint64_t fid_gen = 0;
uint64_t gen_mask;
uint64_t zp_gen;
- int i, err;
+ int i, err;
*vpp = NULL;
@@ -1665,8 +2069,10 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
VN_HOLD(*vpp);
}
ZFS_EXIT(zfsvfs);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (0);
+ err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
}
gen_mask = -1ULL >> (64 - 8 * i);
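	/* e.g., i == 4 gen bytes in the fid gives gen_mask == 0x00000000ffffffffULL */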
@@ -1676,7 +2082,9 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
ZFS_EXIT(zfsvfs);
return (err);
}
- zp_gen = zp->z_phys->zp_gen & gen_mask;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
if (zp_gen == 0)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
@@ -1686,12 +2094,14 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
return (EINVAL);
}
- ZFS_EXIT(zfsvfs);
-
*vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
- return (0);
+ ZFS_EXIT(zfsvfs);
+ err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
}
/*
@@ -1701,17 +2111,13 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
* 'z_teardown_inactive_lock' write held.
*/
int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
-
- *modep = zfsvfs->z_os->os_mode;
- if (name)
- dmu_objset_name(zfsvfs->z_os, name);
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
return (0);
}
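/*
 * Illustrative sketch (not part of the patch): the intended pairing of the
 * two halves above, as used by the online-recv path.  Error handling is
 * elided; on resume failure zfs_resume_fs() itself forces an unmount.
 */
static void
example_suspend_resume(zfsvfs_t *zfsvfs, const char *osname)
{
	if (zfs_suspend_fs(zfsvfs) == 0) {
		/* ... the underlying dataset is swapped out here ... */
		(void) zfs_resume_fs(zfsvfs, osname);
	}
}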
@@ -1720,21 +2126,49 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
* Reopen zfsvfs_t::z_os and release VOPs.
*/
int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
int err;
ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
- err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+ err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
+ &zfsvfs->z_os);
if (err) {
zfsvfs->z_os = NULL;
} else {
znode_t *zp;
+ uint64_t sa_obj = 0;
+
+ /*
+ * Make sure version hasn't changed
+ */
+
+ err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
+ &zfsvfs->z_version);
+
+ if (err)
+ goto bail;
+
+ err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+ if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
+ goto bail;
+
+ if ((err = sa_setup(zfsvfs->z_os, sa_obj,
+ zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0)
+ goto bail;
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(zfsvfs->z_os,
+ zfs_sa_upgrade);
VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+ zfs_set_fuid_feature(zfsvfs);
+
/*
* Attempt to re-establish all the active znodes with
* their dbufs. If a zfs_rezget() fails, then we'll let
@@ -1747,17 +2181,17 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
-
}
+bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
if (err) {
/*
- * Since we couldn't reopen zfsvfs::z_os, force
- * unmount this file system.
+ * Since we couldn't reopen zfsvfs::z_os, or
+ * set up the SA framework, force unmount this file system.
*/
if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
@@ -1773,9 +2207,11 @@ zfs_freevfs(vfs_t *vfsp)
#ifdef sun
/*
* If this is a snapshot, we have an extra VFS_HOLD on our parent
- * from zfs_mount(). Release it here.
+ * from zfs_mount(). Release it here. If we came through
+ * zfs_mountroot() instead, we didn't grab an extra hold, so
+ * skip the VFS_RELE for rootvfs.
*/
- if (zfsvfs->z_issnap)
+ if (zfsvfs->z_issnap && (vfsp != rootvfs))
VFS_RELE(zfsvfs->z_parent->z_vfs);
#endif /* sun */
@@ -1825,17 +2261,17 @@ zfs_init(void)
printf("ZFS filesystem version " ZPL_VERSION_STRING "\n");
/*
- * Initialize znode cache, vnode ops, etc...
+ * Initialize .zfs directory structures
*/
- zfs_znode_init();
+ zfsctl_init();
/*
- * Initialize .zfs directory structures
+ * Initialize znode cache, vnode ops, etc...
*/
- zfsctl_init();
+ zfs_znode_init();
/*
- * Reduce number of vnode. Originally number of vnodes is calculated
+ * Reduce the number of vnodes. Originally the number of vnodes was calculated
* with UFS inode in mind. We reduce it here, because it's too big for
* ZFS/i386.
*/
@@ -1871,13 +2307,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
if (newvers < zfsvfs->z_version)
return (EINVAL);
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (ENOTSUP);
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
+
error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &newvers, tx);
@@ -1886,20 +2332,35 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (error);
}
- spa_history_internal_log(LOG_DS_UPGRADE,
- dmu_objset_spa(os), tx, CRED(),
- "oldver=%llu newver=%llu dataset = %llu",
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT3U(error, ==, 0);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal(LOG_DS_UPGRADE,
+ dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
zfsvfs->z_version, newvers, dmu_objset_id(os));
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
- if (zfsvfs->z_version >= ZPL_VERSION_FUID)
- zfs_set_fuid_feature(zfsvfs);
+ zfs_set_fuid_feature(zfsvfs);
return (0);
}
+
/*
* Read a property stored within the master node.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index f2fdb7a38324..795a7bd216b4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -23,6 +23,7 @@
*/
/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
@@ -47,10 +48,12 @@
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
@@ -58,6 +61,7 @@
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
@@ -122,7 +126,7 @@
* (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors.
*
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
* to ensure that synchronous semantics are provided when necessary.
*
* In general, this is how things should be ordered in each vnode op:
@@ -154,7 +158,7 @@
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
+ * zil_commit(zilog, foid); // synchronous when necessary
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
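/*
 * Illustrative sketch (not part of the patch): a skeleton of the ordering
 * contract the comment above describes.  All names except the dmu_tx_*()
 * and zil_commit() calls are hypothetical placeholders.
 */
static int
example_vnode_op(zfsvfs_t *zfsvfs, zilog_t *zilog, uint64_t foid)
{
	dmu_tx_t *tx;
	int error;

top:
	tx = dmu_tx_create(zfsvfs->z_os);		/* hold what we touch */
	error = dmu_tx_assign(tx, TXG_NOWAIT);		/* try to assign */
	if (error != 0) {
		if (error == ERESTART) {		/* wait and retry */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);			/* hard failure */
		return (error);
	}
	/* ... modify the object, log the change in the tx ... */
	dmu_tx_commit(tx);				/* (6) always commit */
	zil_commit(zilog, foid);			/* (7) after locks drop */
	return (0);
}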
@@ -169,7 +173,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & FAPPEND) == 0)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
@@ -177,8 +181,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0) {
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
if (fs_vscan(*vpp, cr, 0) != 0) {
ZFS_EXIT(zfsvfs);
return (EACCES);
@@ -216,8 +219,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0)
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
VERIFY(fs_vscan(vp, cr, 1) == 0);
ZFS_EXIT(zfsvfs);
@@ -237,7 +239,7 @@ zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
int error;
boolean_t hole;
- file_sz = zp->z_phys->zp_size;
+ file_sz = zp->z_size;
if (noff >= file_sz) {
return (ENXIO);
}
@@ -370,7 +372,6 @@ zfs_unmap_page(struct sf_buf *sf)
sf_buf_free(sf);
}
-
/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
@@ -378,7 +379,6 @@ zfs_unmap_page(struct sf_buf *sf)
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
*/
-
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
int segflg, dmu_tx_t *tx)
@@ -420,6 +420,71 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
}
/*
+ * A read with the UIO_NOCOPY flag means that sendfile(2) is asking
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain VPO_BUSY on all of them,
+ * map them into a contiguous KVA region and populate them
+ * in a single dmu_read() call.
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(uio->uio_segflg == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+ VM_OBJECT_LOCK(obj);
+ for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
+ if (pp->valid == 0) {
+ vm_page_io_start(pp);
+ VM_OBJECT_UNLOCK(obj);
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ VM_OBJECT_LOCK(obj);
+ vm_page_io_finish(pp);
+ vm_page_lock(pp);
+ if (error) {
+ vm_page_free(pp);
+ } else {
+ pp->valid = VM_PAGE_BITS_ALL;
+ vm_page_activate(pp);
+ }
+ vm_page_unlock(pp);
+ }
+ if (error)
+ break;
+ uio->uio_resid -= bytes;
+ uio->uio_offset += bytes;
+ len -= bytes;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
+
+/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
@@ -435,14 +500,11 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
znode_t *zp = VTOZ(vp);
objset_t *os = zp->z_zfsvfs->z_os;
vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
int64_t start;
caddr_t va;
int len = nbytes;
int off;
int error = 0;
- uint64_t dirbytes;
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
@@ -450,98 +512,25 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
start = uio->uio_loffset;
off = start & PAGEOFFSET;
- dirbytes = 0;
VM_OBJECT_LOCK(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- int bytes = MIN(PAGESIZE - off, len);
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, off, bytes)) {
- if ((m->oflags & VPO_BUSY) != 0) {
- /*
- * Reference the page before unlocking and
- * sleeping so that the page daemon is less
- * likely to reclaim it.
- */
- vm_page_lock_queues();
- vm_page_flag_set(m, PG_REFERENCED);
- vm_page_sleep(m, "zfsmrb");
- goto again;
- }
+ if (pp = page_lookup(vp, start, off, bytes)) {
+ struct sf_buf *sf;
+ caddr_t va;
- vm_page_busy(m);
VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0)
- uiomove_fromphys(&m, off, bytes, uio);
+ va = zfs_map_page(pp, &sf);
+ error = uiomove(va + off, bytes, UIO_READ, uio);
+ zfs_unmap_page(sf);
VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else if (uio->uio_segflg == UIO_NOCOPY) {
- /*
- * The code below is here to make sendfile(2) work
- * correctly with ZFS. As pointed out by ups@
- * sendfile(2) should be changed to use VOP_GETPAGES(),
- * but it pessimize performance of sendfile/UFS, that's
- * why I handle this special case in ZFS code.
- */
- KASSERT(off == 0,
- ("unexpected offset in mappedread for sendfile"));
- if (m != NULL && (m->oflags & VPO_BUSY) != 0) {
- /*
- * Reference the page before unlocking and
- * sleeping so that the page daemon is less
- * likely to reclaim it.
- */
- vm_page_lock_queues();
- vm_page_flag_set(m, PG_REFERENCED);
- vm_page_sleep(m, "zfsmrb");
- goto again;
- } else if (m == NULL) {
- m = vm_page_alloc(obj, OFF_TO_IDX(start),
- VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL);
- if (m == NULL) {
- VM_OBJECT_UNLOCK(obj);
- VM_WAIT;
- VM_OBJECT_LOCK(obj);
- goto again;
- }
- }
- vm_page_io_start(m);
+ page_unlock(pp);
+ } else {
VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- va = zfs_map_page(m, &sf);
- error = dmu_read(os, zp->z_id, start, bytes, va,
- DMU_READ_PREFETCH);
- if (bytes != PAGE_SIZE)
- bzero(va + bytes, PAGE_SIZE - bytes);
- zfs_unmap_page(sf);
- }
+ error = dmu_read_uio(os, zp->z_id, uio, bytes);
VM_OBJECT_LOCK(obj);
- vm_page_io_finish(m);
- vm_page_lock(m);
- if (error == 0) {
- m->valid = VM_PAGE_BITS_ALL;
- vm_page_activate(m);
- } else
- vm_page_free(m);
- vm_page_unlock(m);
-
- if (error == 0) {
- uio->uio_resid -= bytes;
- uio->uio_offset += bytes;
- }
- } else {
- dirbytes += bytes;
}
len -= bytes;
off = 0;
@@ -549,8 +538,6 @@ again:
break;
}
VM_OBJECT_UNLOCK(obj);
- if (error == 0 && dirbytes > 0)
- error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
return (error);
}
@@ -584,12 +571,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ssize_t n, nbytes;
int error;
rl_t *rl;
+ xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
os = zfsvfs->z_os;
- if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (EACCES);
}
@@ -613,7 +601,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* Check for mandatory locks
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+ if (MANDMODE(zp->z_mode)) {
if (error = chklock(vp, FREAD,
uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
ZFS_EXIT(zfsvfs);
@@ -624,8 +612,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* If we're in FRSYNC mode, sync out this znode before reading it.
*/
- if (ioflag & FRSYNC)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
/*
* Lock the range against changes.
@@ -636,18 +624,54 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* If we are reading past end-of-file we can skip
* to the end; but we might still need to set atime.
*/
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
+ if (uio->uio_loffset >= zp->z_size) {
error = 0;
goto out;
}
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+ ASSERT(uio->uio_loffset < zp->z_size);
+ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+#ifdef sun
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ (void) dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(vp)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ (void) dmu_xuio_add(xuio,
+ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz), 0, blksz);
+ }
+ }
+ }
+#endif /* sun */
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+#ifdef __FreeBSD__
+ if (uio->uio_segflg == UIO_NOCOPY)
+ error = mappedread_sf(vp, nbytes, uio);
+ else
+#endif /* __FreeBSD__ */
if (vn_has_cached_data(vp))
error = mappedread(vp, nbytes, uio);
else
@@ -661,7 +685,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n -= nbytes;
}
-
out:
zfs_range_unlock(rl);
@@ -671,53 +694,6 @@ out:
}
/*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified.
- * Any error will exit this routine as this is only a best
- * attempt to get the pages resident. This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
- struct iovec *iov;
- ulong_t cnt, incr;
- caddr_t p;
-
- if (uio->uio_segflg != UIO_USERSPACE)
- return;
-
- iov = uio->uio_iov;
-
- while (n) {
- cnt = MIN(iov->iov_len, n);
- if (cnt == 0) {
- /* empty iov entry */
- iov++;
- continue;
- }
- n -= cnt;
- /*
- * touch each page in this segment.
- */
- p = iov->iov_base;
- while (cnt) {
- if (fubyte(p) == -1)
- return;
- incr = MIN(cnt, PAGESIZE);
- p += incr;
- cnt -= incr;
- }
- /*
- * touch the last byte in case it straddles a page.
- */
- p--;
- if (fubyte(p) == -1)
- return;
- iov++;
- }
-}
-
-/*
* Write the bytes to a file.
*
* IN: vp - vnode of file to be written to.
@@ -735,6 +711,7 @@ zfs_prefault_write(ssize_t n, struct uio *uio)
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
*/
+
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -751,9 +728,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ssize_t n, nbytes;
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
- uint64_t pflags;
int error;
arc_buf_t *abuf;
+ iovec_t *aiov;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
+ int count = 0;
+ sa_bulk_attr_t bulk[4];
+ uint64_t mtime[2], ctime[2];
/*
* Fasttrack empty write
@@ -768,13 +753,19 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
/*
* If immutable or not appending then return EPERM
*/
- pflags = zp->z_phys->zp_flags;
- if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_phys->zp_size))) {
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -782,44 +773,61 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
zilog = zfsvfs->z_log;
/*
+ * Validate file offset
+ */
+ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Check for mandatory locks before calling zfs_range_lock()
+ * in order to prevent a deadlock with locks set via fcntl().
+ */
+ if (MANDMODE((mode_t)zp->z_mode) &&
+ (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef sun
+ /*
+ * Pre-fault the pages to ensure slow (e.g., NFS) pages
+ * don't hold up the txg.
+ * Skip this if the uio contains a loaned arc_buf.
*/
- zfs_prefault_write(n, uio);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif /* sun */
/*
* If in append mode, set the io offset pointer to eof.
*/
if (ioflag & FAPPEND) {
/*
- * Range lock for a file append:
- * The value for the start of range will be determined by
- * zfs_range_lock() (to guarantee append semantics).
- * If this write will cause the block size to increase,
- * zfs_range_lock() will lock the entire file, so we must
- * later reduce the range after we grow the block size.
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
*/
rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+ woff = rl->r_off;
if (rl->r_len == UINT64_MAX) {
- /* overlocked, zp_size can't change */
- woff = uio->uio_loffset = zp->z_phys->zp_size;
- } else {
- woff = uio->uio_loffset = rl->r_off;
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
}
+ uio->uio_loffset = woff;
} else {
- woff = uio->uio_loffset;
/*
- * Validate file offset
- */
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If we need to grow the block size then zfs_range_lock()
- * will lock a wider range than we request here.
- * Later after growing the block size we reduce the range.
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
*/
rl = zfs_range_lock(zp, woff, n, RL_WRITER);
}
@@ -833,16 +841,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- end_size = MAX(zp->z_phys->zp_size, woff + n);
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_size);
+
+ end_size = MAX(zp->z_size, woff + n);
/*
* Write the file in reasonable size chunks. Each chunk is written
@@ -852,31 +854,41 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
-
again:
- if (zfs_usergroup_overquota(zfsvfs,
- B_FALSE, zp->z_phys->zp_uid) ||
- zfs_usergroup_overquota(zfsvfs,
- B_TRUE, zp->z_phys->zp_gid)) {
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
error = EDQUOT;
break;
}
- /*
- * If dmu_assign_arcbuf() is expected to execute with minimum
- * overhead loan an arc buffer and copy user data to it before
- * we enter a txg. This avoids holding a txg forever while we
- * pagefault on a hanging NFS server mapping.
- */
- if (abuf == NULL && n >= max_blksz &&
- woff >= zp->z_phys->zp_size &&
+ if (xuio && abuf == NULL) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (abuf == NULL && n >= max_blksz &&
+ woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
size_t cbytes;
- abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if (error = uiocopy(abuf->b_data, max_blksz,
@@ -891,8 +903,9 @@ again:
* Start a transaction.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -931,22 +944,38 @@ again:
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
- if (woff + nbytes > zp->z_phys->zp_size)
+ if (woff + nbytes > zp->z_size)
vnode_pager_setsize(vp, woff + nbytes);
if (abuf == NULL) {
tx_bytes = uio->uio_resid;
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
- nbytes, tx);
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
- ASSERT(tx_bytes == max_blksz);
- dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+ woff, abuf, tx);
+ }
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
-
if (tx_bytes && vn_has_cached_data(vp)) {
update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
zp->z_id, uio->uio_segflg, tx);
@@ -957,6 +986,8 @@ again:
* partial progress, update the znode and ZIL accordingly.
*/
if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
dmu_tx_commit(tx);
ASSERT(error != 0);
break;
@@ -974,29 +1005,41 @@ again:
* user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
(S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(vp, cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
}
mutex_exit(&zp->z_acl_lock);
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
/*
* Update the file size (zp_size) if it has changed;
* account for possible concurrent updates.
*/
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
+ ASSERT(error == 0);
+ }
+ /*
+ * If we are replaying and eof is non zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
@@ -1004,6 +1047,11 @@ again:
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
+
+#ifdef sun
+ if (!xuio && n > 0)
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif /* sun */
}
zfs_range_unlock(rl);
@@ -1017,31 +1065,36 @@ again:
return (error);
}
- if (ioflag & (FSYNC | FDSYNC))
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
+ if (ioflag & (FSYNC | FDSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
}
void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
+zfs_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
- vnode_t *vp = ZTOV(rl->r_zp);
- objset_t *os = rl->r_zp->z_zfsvfs->z_os;
+ znode_t *zp = zgd->zgd_private;
+ objset_t *os = zp->z_zfsvfs->z_os;
int vfslocked;
- vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_range_unlock(zgd->zgd_rl);
+
+ vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
- VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
kmem_free(zgd, sizeof (zgd_t));
VFS_UNLOCK_GIANT(vfslocked);
}
@@ -1059,20 +1112,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
- uint64_t off = lr->lr_offset;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
int error = 0;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
/*
* Nothing to do if the file has been removed
*/
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
return (ENOENT);
if (zp->z_unlinked) {
/*
@@ -1084,6 +1138,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
return (ENOENT);
}
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_private = zp;
+
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
@@ -1092,17 +1150,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (offset >= zp->z_size) {
error = ENOENT;
- goto out;
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
}
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
- DMU_READ_NO_PREFETCH));
+ ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@@ -1110,80 +1167,59 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- if (zp->z_blksz == dlen)
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+ RL_READER);
+ if (zp->z_blksz == size)
break;
- zfs_range_unlock(rl);
+ offset += blkoff;
+ zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (lr->lr_offset >= zp->z_size)
error = ENOENT;
- goto out;
- }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
#ifdef DEBUG
if (zil_fault_io) {
error = EIO;
zil_fault_io = 0;
- } else {
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
}
-#else
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
#endif
- if (error != 0) {
- kmem_free(zgd, sizeof (zgd_t));
- goto out;
- }
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT((error && error != EINPROGRESS) ||
- lr->lr_length <= zp->z_blksz);
if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= zp->z_blksz);
+
/*
- * dmu_sync() can compress a block of zeros to a null
- * blkptr but the block size still needs to be passed
- * through to replay.
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
*/
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
- }
+ if (error == 0)
+ return (0);
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS) {
- return (0);
- } else if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- error = 0;
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ error = 0;
+ }
}
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
}
-out:
- zfs_range_unlock(rl);
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ zfs_get_done(zgd, error);
+
return (error);
}
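/*
 * Illustrative sketch (not part of the patch): the "two flavors" mentioned
 * above are picked on the logging side.  The threshold parameter and exact
 * policy are assumptions based on the OpenSolaris ZIL of this era, not
 * shown in this hunk.
 */
static itx_wr_state_t
example_pick_write_state(ssize_t blocksize, ssize_t immediate_sz, int ioflag)
{
	if (blocksize > immediate_sz)
		return (WR_INDIRECT);	/* sync the block, log its blkptr */
	if (ioflag & (FSYNC | FDSYNC))
		return (WR_COPIED);	/* copy data into the log record */
	return (WR_NEED_COPY);		/* defer the copy until commit */
}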
@@ -1267,7 +1303,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
if (dvp->v_type != VDIR) {
return (ENOTDIR);
- } else if (zdp->z_dbuf == NULL) {
+ } else if (zdp->z_sa_hdl == NULL) {
return (EIO);
}
@@ -1321,7 +1357,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
* We don't allow recursive attributes..
* Maybe someday we will.
*/
- if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+ if (zdp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1394,7 +1430,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
VOP_UNLOCK(dvp, 0);
}
ZFS_EXIT(zfsvfs);
- error = vn_lock(*vpp, cnp->cn_lkflags);
+ error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
if (cnp->cn_flags & ISDOTDOT)
vn_lock(dvp, ltype | LK_RETRY);
if (error != 0) {
@@ -1466,8 +1502,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
void *vsecp = NULL;
int flag = 0;
@@ -1481,9 +1518,10 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
uid = ksid_getid(ksid);
else
uid = crgetuid(cr);
+
if (zfsvfs->z_use_fuids == B_FALSE &&
(vsecp || (vap->va_mask & AT_XVATTR) ||
- IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (EINVAL);
ZFS_ENTER(zfsvfs);
@@ -1528,12 +1566,15 @@ top:
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
NULL, NULL);
if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
if (strcmp(name, "..") == 0)
error = EISDIR;
ZFS_EXIT(zfsvfs);
return (error);
}
}
+
if (zp == NULL) {
uint64_t txtype;
@@ -1542,6 +1583,8 @@ top:
* to reference it.
*/
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
goto out;
}
@@ -1549,16 +1592,20 @@ top:
* We only support the creation of regular files in
* extended attribute directories.
*/
- if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
(vap->va_type != VREG)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
error = EINVAL;
goto out;
}
-
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
- &acl_ids)) != 0)
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
goto out;
+ have_acl = B_TRUE;
+
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
error = EDQUOT;
@@ -1566,36 +1613,39 @@ top:
}
tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ 0, acl_ids.z_aclp->z_acl_bytes);
}
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
(void) zfs_link_create(dl, zp, tx, ZNEW);
-
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
if (flag & FIGNORECASE)
txtype |= TX_CI;
@@ -1606,6 +1656,10 @@ top:
} else {
int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
/*
* A directory entry already exists for this name.
*/
@@ -1660,6 +1714,9 @@ out:
error = specvp_check(vpp, cr);
}
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
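/*
 * Editor's sketch (not part of the patch): the TXG_NOWAIT/ERESTART retry
 * discipline used by zfs_create() above and repeated in most operations
 * below.  Note the ordering fix the patch makes: on ERESTART the acl_ids
 * survive for the retry and are freed only on a hard failure.  The
 * build_holds callback is hypothetical, standing in for the per-op
 * lock/hold setup.
 */
static int
zfs_tx_retry_sketch(zfsvfs_t *zfsvfs, dmu_tx_t *(*build_holds)(zfsvfs_t *))
{
	dmu_tx_t *tx;
	int error;

	for (;;) {
		tx = build_holds(zfsvfs);	/* create tx + dmu_tx_hold_*() */
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error != ERESTART)
			break;
		dmu_tx_wait(tx);	/* wait for the next open txg */
		dmu_tx_abort(tx);	/* drop holds, then rebuild them */
	}
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... do the work under tx ... */
	dmu_tx_commit(tx);
	return (0);
}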
@@ -1680,17 +1737,22 @@ out:
* dvp - ctime|mtime
* vp - ctime (if nlink > 0)
*/
+
+uint64_t null_xattr = 0;
+
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
int flags)
{
znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
+ znode_t *xzp;
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
+ uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
boolean_t may_delete_now, delete_now = FALSE;
@@ -1712,6 +1774,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
}
top:
+ xattr_obj = 0;
+ xzp = NULL;
/*
* Attempt to lock directory; fail if entry doesn't exist.
*/
@@ -1744,7 +1808,9 @@ top:
else
dnlc_remove(dvp, name);
- may_delete_now = FALSE;
+ VI_LOCK(vp);
+ may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
+ VI_UNLOCK(vp);
/*
* We may delete the znode now, or we may put it in the unlinked set;
@@ -1752,27 +1818,34 @@ top:
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
+ obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
if (may_delete_now) {
toobig =
- zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+ zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
/* if the file is too big, only hold_free a token amount */
dmu_tx_hold_free(tx, zp->z_id, 0,
(toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
}
/* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- /* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -1781,6 +1854,8 @@ top:
if (error) {
zfs_dirent_unlock(dl);
VN_RELE(vp);
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
@@ -1803,29 +1878,45 @@ top:
goto out;
}
- if (0 && unlinked) {
+ if (unlinked) {
+
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
VI_LOCK(vp);
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
delete_now = may_delete_now && !toobig &&
vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+ xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
+ acl_obj;
VI_UNLOCK(vp);
}
if (delete_now) {
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ if (xattr_obj_unlinked) {
+ ASSERT3U(xzp->z_links, ==, 2);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
+ xzp->z_links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx);
+ ASSERT3U(error, ==, 0);
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
}
- mutex_enter(&zp->z_lock);
VI_LOCK(vp);
vp->v_count--;
ASSERT3U(vp->v_count, ==, 0);
@@ -1833,13 +1924,14 @@ top:
mutex_exit(&zp->z_lock);
zfs_znode_delete(zp, tx);
} else if (unlinked) {
+ mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
}
txtype = TX_REMOVE;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
@@ -1848,12 +1940,13 @@ out:
zfs_dirent_unlock(dl);
- if (!delete_now) {
+ if (!delete_now)
VN_RELE(vp);
- } else if (xzp) {
- /* this rele is delayed to prevent nesting transactions */
+ if (xzp)
VN_RELE(ZTOV(xzp));
- }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
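/*
 * Editor's sketch (illustration only): the optimistic recheck behind
 * delete_now in zfs_remove() above.  The xattr and external-ACL objects
 * are sampled when the tx holds are built, then re-read under z_lock
 * after zfs_link_destroy(); the in-place delete proceeds only if neither
 * moved, since zfs_sa_upgrade() may relocate them in the meantime.  The
 * vnode hold/cache checks are omitted here for brevity.
 */
static boolean_t
zfs_delete_now_ok_sketch(znode_t *zp, zfsvfs_t *zfsvfs,
    uint64_t xattr_obj_at_hold, uint64_t acl_obj_at_hold,
    boolean_t may_delete_now)
{
	uint64_t xattr_obj_now = 0;

	ASSERT(MUTEX_HELD(&zp->z_lock));
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj_now, sizeof (xattr_obj_now));
	return (may_delete_now &&
	    xattr_obj_now == xattr_obj_at_hold &&
	    zfs_external_acl(zp) == acl_obj_at_hold);
}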
@@ -1895,7 +1988,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
ASSERT(vap->va_type == VDIR);
@@ -1911,15 +2004,15 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
- IS_EPHEMERAL(crgetgid(cr))))
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (EINVAL);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ if (dzp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1932,37 +2025,43 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
if (flags & FIGNORECASE)
zf |= ZCILOOK;
- if (vap->va_mask & AT_XVATTR)
+ if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
crgetuid(cr), cr, vap->va_type)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
+ }
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ vsecp, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
/*
* First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
*/
top:
*vpp = NULL;
if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
NULL, NULL)) {
+ zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
- &acl_ids)) != 0) {
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
@@ -1979,18 +2078,23 @@ top:
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1999,10 +2103,11 @@ top:
/*
* Create new node.
*/
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
+
/*
* Now put new name in parent dir.
*/
@@ -2017,10 +2122,14 @@ top:
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
+
dmu_tx_commit(tx);
zfs_dirent_unlock(dl);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (0);
}
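/*
 * Editor's note: the tail added above --
 *
 *	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 *		zil_commit(zilog, 0);
 *
 * recurs in every namespace-changing entry point in this patch (create,
 * remove, mkdir, rmdir, rename, symlink, link).  With sync=always the
 * intent log is flushed before returning, so the operation reaches
 * stable storage.  zil_commit() also dropped its sequence-number
 * argument in this rework; a foid of 0 commits all pending itxs rather
 * than those of one file.
 */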
@@ -2108,8 +2217,10 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
rw_exit(&zp->z_parent_lock);
@@ -2136,7 +2247,7 @@ top:
uint64_t txtype = TX_RMDIR;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
@@ -2151,6 +2262,9 @@ out:
VN_RELE(vp);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -2197,6 +2311,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
zap_attribute_t zap;
uint_t bytes_wanted;
uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
int local_eof;
int outcount;
int error;
@@ -2210,6 +2325,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
/*
* If we are not given an eof variable,
* use a local one.
@@ -2273,8 +2394,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
* Minimum entry size is dirent size and 1 byte for a file name.
*/
ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
- cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
- *cookies = cooks;
+ *cookies = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
*ncookies = ncooks;
}
/*
@@ -2298,7 +2418,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
while (outcount < bytes_wanted) {
ino64_t objnum;
ushort_t reclen;
- off64_t *next;
+ off64_t *next = NULL;
/*
* Special case `.', `..', and `.zfs'.
@@ -2311,7 +2431,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
} else if (offset == 1) {
(void) strcpy(zap.za_name, "..");
zap.za_normalization_conflict = 0;
- objnum = zp->z_phys->zp_parent;
+ objnum = parent;
type = DT_DIR;
} else if (offset == 2 && zfs_show_ctldir(zp)) {
(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
@@ -2421,6 +2541,16 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
if (prefetch)
dmu_prefetch(os, objnum, 0, 0);
+ if (ncookies != NULL) {
+ if (cooks == NULL)
+ cooks = *cookies;
+ else {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+
skip_entry:
/*
* Move to the next entry, fill in the previous offset.
@@ -2431,12 +2561,6 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
} else {
offset += 1;
}
-
- if (cooks != NULL) {
- *cooks++ = offset;
- ncooks--;
- KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
- }
}
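/*
 * Editor's note: the cookie store moved from after the offset advance
 * (the block removed under skip_entry: above) to before it, so an entry
 * that jumps to skip_entry no longer consumes a cookie slot.  The first
 * iteration only seeds cooks from *cookies; each later iteration records
 * the current entry's starting offset, which is exactly the "resume
 * after the previous entry" value FreeBSD's VOP_READDIR cookie contract
 * expects.
 */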
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
@@ -2485,10 +2609,12 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
- ZFS_EXIT(zfsvfs);
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
return (0);
}
@@ -2515,26 +2641,38 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp;
int error = 0;
uint32_t blksize;
u_longlong_t nblocks;
uint64_t links;
+ uint64_t mtime[2], ctime[2], crtime[2];
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
/*
* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
* Also, if we are the owner don't bother, since owner should
* always be allowed to read basic attributes of file.
*/
- if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
- (pzp->zp_uid != crgetuid(cr))) {
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
ZFS_EXIT(zfsvfs);
@@ -2548,19 +2686,18 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
*/
mutex_enter(&zp->z_lock);
- vap->va_type = IFTOVT(pzp->zp_mode);
- vap->va_mode = pzp->zp_mode & ~S_IFMT;
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
// vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
vap->va_nodeid = zp->z_id;
if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
- links = pzp->zp_links + 1;
+ links = zp->z_links + 1;
else
- links = pzp->zp_links;
+ links = zp->z_links;
vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
+ vap->va_size = zp->z_size;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
+// vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
vap->va_seq = zp->z_seq;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
@@ -2571,110 +2708,114 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
xoap->xoa_archive =
- ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
xoap->xoa_readonly =
- ((pzp->zp_flags & ZFS_READONLY) != 0);
+ ((zp->z_pflags & ZFS_READONLY) != 0);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
xoap->xoa_system =
- ((pzp->zp_flags & ZFS_SYSTEM) != 0);
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
xoap->xoa_hidden =
- ((pzp->zp_flags & ZFS_HIDDEN) != 0);
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
xoap->xoa_nounlink =
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
xoap->xoa_immutable =
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
xoap->xoa_appendonly =
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
xoap->xoa_nodump =
- ((pzp->zp_flags & ZFS_NODUMP) != 0);
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
xoap->xoa_opaque =
- ((pzp->zp_flags & ZFS_OPAQUE) != 0);
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
xoap->xoa_av_quarantined =
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
xoap->xoa_av_modified =
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- vp->v_type == VREG &&
- (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
- /*
- * Only VREG files have anti-virus scanstamps, so we
- * won't conflict with symlinks in the bonus buffer.
- */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len <= doi.doi_bonus_size) {
- /*
- * pzp points to the start of the
- * znode_phys_t. pzp + 1 points to the
- * first byte after the znode_phys_t.
- */
- (void) memcpy(xoap->xoa_av_scanstamp,
- pzp + 1,
- sizeof (xoap->xoa_av_scanstamp));
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
}
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
+ uint64_t times[2];
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ times, sizeof (times));
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
- ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
mutex_exit(&zp->z_lock);
- dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
vap->va_bytes = nblocks << 9; /* nblocks * 512 */
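/*
 * Editor's sketch: the SA bulk-lookup pattern the rewritten zfs_getattr()
 * relies on.  Attribute/value pairs are queued with SA_ADD_BULK_ATTR()
 * and fetched in one pass; names mirror the patch, and the 16-byte size
 * is the two-uint64 timestamp encoding.  Illustration only.
 */
static int
zfs_get_times_sketch(znode_t *zp, zfsvfs_t *zfsvfs,
    uint64_t mtime[2], uint64_t ctime[2])
{
	sa_bulk_attr_t bulk[2];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    mtime, sizeof (uint64_t) * 2);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    ctime, sizeof (uint64_t) * 2);
	/* one lookup fills every queued attribute or fails as a unit */
	return (sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
}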
@@ -2713,7 +2854,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
- znode_phys_t *pzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
@@ -2725,15 +2865,19 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
int trim_mask = 0;
uint64_t new_mode;
uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
znode_t *attrzp;
int need_policy = FALSE;
- int err;
+ int err, err2;
zfs_fuid_info_t *fuidp = NULL;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap;
- zfs_acl_t *aclp = NULL;
+ zfs_acl_t *aclp;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- boolean_t fuid_dirtied = B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
if (mask == 0)
return (0);
@@ -2744,7 +2888,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
zilog = zfsvfs->z_log;
/*
@@ -2781,14 +2924,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
/*
* Immutable files can only alter immutable bit and atime
*/
- if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
- if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+ if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -2809,6 +2952,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
top:
attrzp = NULL;
+ aclp = NULL;
/* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
@@ -2844,10 +2988,13 @@ top:
((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
skipaclchk, cr);
+ }
if (mask & (AT_UID|AT_GID)) {
int idmask = (mask & (AT_UID|AT_GID));
@@ -2860,7 +3007,7 @@ top:
*/
if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
+ vap->va_mode = zp->z_mode;
/*
* Take ownership or chgrp to group we are a member of
@@ -2898,7 +3045,7 @@ top:
}
mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode;
+ oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
/*
@@ -2910,7 +3057,7 @@ top:
*/
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
if (xoap->xoa_appendonly !=
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
@@ -2920,7 +3067,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
if (xoap->xoa_nounlink !=
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
@@ -2930,7 +3077,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
if (xoap->xoa_immutable !=
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
@@ -2940,7 +3087,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
if (xoap->xoa_nodump !=
- ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NODUMP);
@@ -2950,7 +3097,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
if (xoap->xoa_av_modified !=
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
@@ -2962,7 +3109,7 @@ top:
if ((vp->v_type != VREG &&
xoap->xoa_av_quarantined) ||
xoap->xoa_av_quarantined !=
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
@@ -2970,6 +3117,12 @@ top:
}
}
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
if (need_policy == FALSE &&
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
@@ -3038,79 +3191,89 @@ top:
*/
mask = vap->va_mask;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
- goto out;
- if (pzp->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL from old V0 format to new V1 */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- pzp->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- }
+ if ((mask & (AT_UID | AT_GID))) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
- if (mask & (AT_UID | AT_GID)) {
- if (pzp->zp_xattr) {
- err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
if (err)
- goto out;
- dmu_tx_hold_bonus(tx, attrzp->z_id);
+ goto out2;
}
if (mask & AT_UID) {
new_uid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (new_uid != pzp->zp_uid &&
- zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (new_uid != zp->z_uid &&
+ zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
- goto out;
+ goto out2;
}
}
if (mask & AT_GID) {
new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
cr, ZFS_GROUP, &fuidp);
- if (new_gid != pzp->zp_gid &&
- zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (new_gid != zp->z_gid &&
+ zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
- goto out;
+ goto out2;
}
}
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
- FALSE, NULL);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ zfs_acl_chmod_setattr(zp, &aclp, new_mode);
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
} else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
}
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
}
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if ((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
}
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err) {
if (err == ERESTART)
@@ -3118,8 +3281,7 @@ top:
goto out;
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
+ count = 0;
/*
* Set each attribute requested.
* We group settings according to the locks they need to acquire.
@@ -3128,47 +3290,108 @@ top:
* updated as a side-effect of calling this function.
*/
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
mutex_enter(&zp->z_lock);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
if (mask & AT_MODE) {
- mutex_enter(&zp->z_acl_lock);
- zp->z_phys->zp_mode = new_mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+ ASSERT3U((uintptr_t)aclp, !=, 0);
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = aclp;
aclp = NULL;
- mutex_exit(&zp->z_acl_lock);
}
- if (attrzp)
- mutex_enter(&attrzp->z_lock);
- if (mask & AT_UID) {
- pzp->zp_uid = new_uid;
- if (attrzp)
- attrzp->z_phys->zp_uid = new_uid;
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
}
- if (mask & AT_GID) {
- pzp->zp_gid = new_gid;
- if (attrzp)
- attrzp->z_phys->zp_gid = new_gid;
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
}
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime, B_TRUE);
+ }
+ }
/*
* Do this after setting timestamps to prevent timestamp
* update from toggling bit
@@ -3200,20 +3423,10 @@ top:
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
}
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
ASSERT(vp->v_type == VREG);
- /* Grow the bonus buffer if necessary. */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len > doi.doi_bonus_size)
- VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
- }
- zfs_xvattr_set(zp, xvap);
+ zfs_xvattr_set(zp, xvap, tx);
}
if (fuid_dirtied)
@@ -3223,11 +3436,23 @@ top:
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
mutex_exit(&zp->z_lock);
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
if (attrzp)
VN_RELE(ZTOV(attrzp));
-
if (aclp)
zfs_acl_free(aclp);
@@ -3236,13 +3461,18 @@ out:
fuidp = NULL;
}
- if (err)
+ if (err) {
dmu_tx_abort(tx);
- else
+ if (err == ERESTART)
+ goto top;
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
+ }
- if (err == ERESTART)
- goto top;
+out2:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (err);
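/*
 * Editor's sketch (illustration only): the commit discipline zfs_setattr()
 * now follows -- every change is staged in bulk[] via SA_ADD_BULK_ATTR()
 * and reaches the SA layer only through a single sa_bulk_update() on the
 * success path, inside the same tx that logged the TX_SETATTR record.
 * The ERESTART retry branch is omitted here.
 */
static int
zfs_setattr_commit_sketch(znode_t *zp, sa_bulk_attr_t *bulk, int count,
    dmu_tx_t *tx, int err)
{
	if (err) {
		dmu_tx_abort(tx);	/* staged bulk[] is simply dropped */
		return (err);
	}
	err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(err == 0);
	dmu_tx_commit(tx);
	return (err);
}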
@@ -3283,7 +3513,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zfs_zlock_t *zl;
znode_t *zp = tdzp;
uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
+ uint64_t oidp = zp->z_id;
krwlock_t *rwlp = &szp->z_parent_lock;
krw_t rw = RW_WRITER;
@@ -3305,7 +3535,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zfs_rename_unlock(&zl);
*zlpp = NULL;
zp = tdzp;
- oidp = &zp->z_id;
+ oidp = zp->z_id;
rwlp = &szp->z_parent_lock;
rw = RW_WRITER;
continue;
@@ -3323,19 +3553,20 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zl->zl_next = *zlpp;
*zlpp = zl;
- if (*oidp == szp->z_id) /* We're a descendant of szp */
+ if (oidp == szp->z_id) /* We're a descendant of szp */
return (EINVAL);
- if (*oidp == rootid) /* We've hit the top */
+ if (oidp == rootid) /* We've hit the top */
return (0);
if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
if (error)
return (error);
zl->zl_znode = zp;
}
- oidp = &zp->z_phys->zp_parent;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
+ &oidp, sizeof (oidp));
rwlp = &zp->z_parent_lock;
rw = RW_READER;
@@ -3415,8 +3646,7 @@ top:
* by renaming a linked file into/outof an attribute directory.
* See the comment in zfs_link() for why this is considered bad.
*/
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -3517,6 +3747,11 @@ top:
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
+ /*
+ * FreeBSD: In OpenSolaris they only check if rename source is
+ * ".." here, because "." is handled in their lookup. This is
+ * not the case for FreeBSD, so we check for "." explicitly.
+ */
if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
serr = EINVAL;
ZFS_EXIT(zfsvfs);
@@ -3596,14 +3831,20 @@ top:
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
@@ -3634,17 +3875,39 @@ top:
if (error == 0) {
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
if (error == 0) {
- szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+ szp->z_pflags |= ZFS_AV_MODIFIED;
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
- zfs_log_rename(zilog, tx,
- TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
- sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
- /* Update path information for the target vnode */
- vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, ZTOV(szp), tnm,
+ strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
}
#ifdef FREEBSD_NAMECACHE
if (error == 0) {
@@ -3665,10 +3928,14 @@ out:
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
+
VN_RELE(ZTOV(szp));
if (tzp)
VN_RELE(ZTOV(tzp));
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
@@ -3701,11 +3968,12 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- int len = strlen(link);
+ uint64_t len = strlen(link);
int error;
int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
int flags = 0;
ASSERT(vap->va_type == VLNK);
@@ -3721,27 +3989,35 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
}
if (flags & FIGNORECASE)
zflg |= ZCILOOK;
-top:
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
return (ENAMETOOLONG);
}
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
- VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
@@ -3751,71 +4027,59 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
/*
* Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
+ * for version 4 ZPL datasets the symlink will be an SA attribute
*/
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
- zp->z_id, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
/*
* Insert the new object into the directory.
*/
(void) zfs_link_create(dl, zp, tx, ZNEW);
- if (error == 0) {
- uint64_t txtype = TX_SYMLINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- *vpp = ZTOV(zp);
- }
+
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *vpp = ZTOV(zp);
zfs_acl_ids_free(&acl_ids);
@@ -3823,6 +4087,9 @@ top:
zfs_dirent_unlock(dl);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
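/*
 * Editor's note: the symlink target now has two homes.  On SA-capable
 * znodes (zp->z_is_sa) it lives in the SA_ZPL_SYMLINK attribute; on
 * older znodes zfs_sa_symlink() keeps the previous bonus-buffer/file-data
 * layout.  zfs_readlink() below mirrors the same split via
 * sa_lookup_uio()/zfs_sa_readlink(), so the on-disk choice is invisible
 * to callers.
 */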
@@ -3850,29 +4117,21 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- size_t bufsz;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- bufsz = (size_t)zp->z_phys->zp_size;
- if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
- error = uiomove(zp->z_phys + 1,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3938,7 +4197,12 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
/* Prevent links to .zfs/shares files */
- if (szp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -3957,16 +4221,14 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
* into "normal" file space in order to circumvent restrictions
* imposed in attribute space.
*/
- if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
- (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
- owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
- if (owner != crgetuid(cr) &&
- secpolicy_basic_link(svp, cr) != 0) {
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -3987,8 +4249,10 @@ top:
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
@@ -4019,9 +4283,250 @@ top:
vnevent_link(svp, ct);
}
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef sun
+/*
+ * zfs_null_putapage() is used when the file system has been force
+ * unmounted. It just drops the pages.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
+ return (0);
+}
+
+/*
+ * Push a page out to disk, klustering if possible.
+ *
+ * IN: vp - file to push page to.
+ * pp - page to push.
+ * flags - additional flags.
+ * cr - credentials of caller.
+ *
+ * OUT: offp - start of range pushed.
+ * lenp - len of range pushed.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * NOTE: callers must have locked the page to be pushed. On
+ * exit, the page (and all other pages in the kluster) must be
+ * unlocked.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ u_offset_t off, koff;
+ size_t len, klen;
+ int err;
+
+ off = pp->p_offset;
+ len = PAGESIZE;
+ /*
+ * If our blocksize is bigger than the page size, try to kluster
+ * multiple pages so that we write a full block (thus avoiding
+ * a read-modify-write).
+ */
+ if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
+ klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+ koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
+ ASSERT(koff <= zp->z_size);
+ if (koff + klen > zp->z_size)
+ klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
+ pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
+ }
+ ASSERT3U(btop(len), ==, btopr(len));
+
+ /*
+ * Can't push pages past end-of-file.
+ */
+ if (off >= zp->z_size) {
+ /* ignore all pages */
+ err = 0;
+ goto out;
+ } else if (off + len > zp->z_size) {
+ int npages = btopr(zp->z_size - off);
+ page_t *trunc;
+
+ page_list_break(&pp, &trunc, npages);
+ /* ignore pages past end of file */
+ if (trunc)
+ pvn_write_done(trunc, flags);
+ len = zp->z_size - off;
+ }
+
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ err = EDQUOT;
+ goto out;
+ }
+top:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz <= PAGESIZE) {
+ caddr_t va = zfs_map_page(pp, S_READ);
+ ASSERT3U(len, <=, PAGESIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+ zfs_unmap_page(pp, va);
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
+ if (offp)
+ *offp = off;
+ if (lenp)
+ *lenp = len;
+
+ return (err);
+}
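/*
 * Editor's sketch: the kluster arithmetic in zfs_putapage() above with
 * concrete numbers.  With an 8K recordsize and 4K pages, a page at file
 * offset 0x3000 is expanded to the enclosing block:
 *
 *	klen = P2ROUNDUP(8192, 4096)             = 8192
 *	koff = P2ALIGN(0x3000, (u_offset_t)8192) = 0x2000
 *
 * so both pages of the 8K block are pushed together and the write avoids
 * a read-modify-write of the block.  Near EOF, klen is clipped back to
 * whole pages with P2ROUNDUP(z_size - koff, PAGESIZE).
 */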
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the file's vnode.
+ *
+ * IN: vp - vnode of file to push page data to.
+ * off - position in file to put data.
+ * len - amount of data to write.
+ * flags - flags to control the operation.
+ * cr - credentials of caller.
+ * ct - caller context.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ page_t *pp;
+ size_t io_len;
+ u_offset_t io_off;
+ uint_t blksz;
+ rl_t *rl;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * Align this request to the file block size in case we kluster.
+ * XXX - this can result in pretty aggressive locking, which can
+ * impact simultaneous read/write access. One option might be
+ * to break up long requests (len == 0) into block-by-block
+ * operations to get narrower locking.
+ */
+ blksz = zp->z_blksz;
+ if (ISP2(blksz))
+ io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+ else
+ io_off = 0;
+ if (len > 0 && ISP2(blksz))
+ io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+ else
+ io_len = 0;
+
+ if (io_len == 0) {
+ /*
+ * Search the entire vp list for pages >= io_off.
+ */
+ rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+ error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
+ goto out;
+ }
+ rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
+
+ if (off > zp->z_size) {
+ /* past end of file */
+ zfs_range_unlock(rl);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
+
+ for (off = io_off; io_off < off + len; io_off += io_len) {
+ if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+ pp = page_lookup(vp, io_off,
+ (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
+ } else {
+ pp = page_lookup_nowait(vp, io_off,
+ (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+ }
+
+ if (pp != NULL && pvn_getdirty(pp, flags)) {
+ int err;
+
+ /*
+ * Found a dirty page to push
+ */
+ err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
+ if (err)
+ error = err;
+ } else {
+ io_len = PAGESIZE;
+ }
+ }
+out:
+ zfs_range_unlock(rl);
+ if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
return (error);
}
+#endif /* sun */
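/*
 * Editor's sketch: the range-lock alignment in zfs_putpage() above, with
 * concrete numbers.  For blksz = 8192 (a power of two), off = 0x2800 and
 * len = 0x1000:
 *
 *	io_off = P2ALIGN_TYPED(0x2800, 8192, u_offset_t)            = 0x2000
 *	io_len = P2ROUNDUP_TYPED(0x1000 + (0x2800 - 0x2000),
 *	    8192, size_t)                                           = 0x2000
 *
 * i.e. the lock covers every block the request touches, so a concurrent
 * kluster cannot race with it.  A non-power-of-two blksz locks from
 * offset 0, and len == 0 widens the lock to UINT64_MAX for the
 * pvn_vplist_dirty() walk.
 */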
/*ARGSUSED*/
void
@@ -4032,13 +4537,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
int error;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf == NULL) {
+ if (zp->z_sa_hdl == NULL) {
/*
* The fs has been unmounted, or we did a
* suspend/resume and this file no longer exists.
*/
VI_LOCK(vp);
- vp->v_count = 0; /* count arrives as 1 */
+ ASSERT(vp->v_count <= 1);
+ vp->v_count = 0;
VI_UNLOCK(vp);
vrecycle(vp, curthread);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
@@ -4048,13 +4554,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
@@ -4065,6 +4573,431 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
+#ifdef sun
+/*
+ * Bounds-check the seek operation.
+ *
+ * IN: vp - vnode seeking within
+ * ooff - old file offset
+ * noffp - pointer to new file offset
+ * ct - caller context
+ *
+ * RETURN: 0 if success
+ * EINVAL if new offset invalid
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ if (vp->v_type == VDIR)
+ return (0);
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+ flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * We are following the UFS semantics with respect to mapcnt
+ * here: If we see that the file is mapped already, then we will
+ * return an error, but we don't worry about races between this
+ * function and zfs_map().
+ */
+ if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering) to fill up the supplied page
+ * list. Note that the pages to be filled are held with an exclusive
+ * lock to prevent access by other threads while they are being filled.
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+ caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+ znode_t *zp = VTOZ(vp);
+ page_t *pp, *cur_pp;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ u_offset_t io_off, total;
+ size_t io_len;
+ int err;
+
+ if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+ /*
+ * We only have a single page, don't bother klustering
+ */
+ io_off = off;
+ io_len = PAGESIZE;
+ pp = page_create_va(vp, io_off, io_len,
+ PG_EXCL | PG_WAIT, seg, addr);
+ } else {
+ /*
+ * Try to find enough pages to fill the page list
+ */
+ pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+ &io_len, off, plsz, 0);
+ }
+ if (pp == NULL) {
+ /*
+ * The page already exists, nothing to do here.
+ */
+ *pl = NULL;
+ return (0);
+ }
+
+ /*
+ * Fill the pages in the kluster.
+ */
+ cur_pp = pp;
+ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
+ ASSERT3U(io_off, ==, cur_pp->p_offset);
+ va = zfs_map_page(cur_pp, S_WRITE);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
+ zfs_unmap_page(cur_pp, va);
+ if (err) {
+ /* On error, toss the entire kluster */
+ pvn_read_done(pp, B_ERROR);
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
+ return (err);
+ }
+ cur_pp = cur_pp->p_next;
+ }
+
+ /*
+ * Fill in the page list array from the kluster starting
+ * from the desired offset `off'.
+ * NOTE: the page list will always be null terminated.
+ */
+ pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+ ASSERT(pl == NULL || (*pl)->p_offset == off);
+
+ return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array. If plsz is greater than len, this function may
+ * also return page pointers from after the specified region
+ * (i.e. the region [off, off + plsz]). These additional pages are
+ * only returned if they are already in the cache, or were created as
+ * part of a klustered read.
+ *
+ * IN: vp - vnode of file to get data from.
+ * off - position in file to get data from.
+ * len - amount of data to retrieve.
+ * plsz - length of provided page list.
+ * seg - segment to obtain pages for.
+ * addr - virtual address of fault.
+ * rw - mode of created pages.
+ * cr - credentials of caller.
+ * ct - caller context.
+ *
+ * OUT: protp - protection mode of created pages.
+ * pl - list of pages created.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+ enum seg_rw rw, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ page_t **pl0 = pl;
+ int err = 0;
+
+ /* we do our own caching, faultahead is unnecessary */
+ if (pl == NULL)
+ return (0);
+ else if (len > plsz)
+ len = plsz;
+ else
+ len = P2ROUNDUP(len, PAGESIZE);
+ ASSERT(plsz >= len);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (protp)
+ *protp = PROT_ALL;
+
+ /*
+ * Loop through the requested range [off, off + len) looking
+ * for pages. If we don't find a page, we will need to create
+ * a new page and fill it with data from the file.
+ */
+ while (len > 0) {
+ if (*pl = page_lookup(vp, off, SE_SHARED))
+ *(pl+1) = NULL;
+ else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
+ goto out;
+ while (*pl) {
+ ASSERT3U((*pl)->p_offset, ==, off);
+ off += PAGESIZE;
+ addr += PAGESIZE;
+ if (len > 0) {
+ ASSERT3U(len, >=, PAGESIZE);
+ len -= PAGESIZE;
+ }
+ ASSERT3U(plsz, >=, PAGESIZE);
+ plsz -= PAGESIZE;
+ pl++;
+ }
+ }
+
+ /*
+ * Fill out the page array with any pages already in the cache.
+ */
+ while (plsz > 0 &&
+ (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
+ off += PAGESIZE;
+ plsz -= PAGESIZE;
+ }
+out:
+ if (err) {
+ /*
+ * Release any pages we have previously locked.
+ */
+ while (pl > pl0)
+ page_unlock(*--pl);
+ } else {
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ }
+
+ *pl = NULL;
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
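+
+/*
+ * A rough walk-through of the loop above (the page counts are
+ * hypothetical): with len spanning three pages of which only the first
+ * is cached, the first iteration finds page 0 via page_lookup() and the
+ * inner loop consumes it; the next iteration misses, so zfs_fillpage()
+ * creates a kluster, fills each new page with dmu_read(), and
+ * pvn_plist_init() deposits their pointers into pl.  The trailing
+ * page_lookup_nowait() loop then opportunistically tops up any leftover
+ * plsz slots from the cache.
+ */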
+
+/*
+ * Request a memory map for a section of a file. This code interacts
+ * with common code and the VM system as follows:
+ *
+ * common code calls mmap(), which ends up in smmap_common()
+ *
+ * this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ * zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ * segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ * zfs_addmap() updates z_mapcnt
+ */
+/*ARGSUSED*/
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ segvn_crargs_t vn_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((prot & PROT_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if ((prot & (PROT_READ | PROT_EXEC)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+ ZFS_EXIT(zfsvfs);
+ return (EACCES);
+ }
+
+ if (vp->v_flag & VNOMAP) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOSYS);
+ }
+
+ if (off < 0 || len > MAXOFFSET_T - off) {
+ ZFS_EXIT(zfsvfs);
+ return (ENXIO);
+ }
+
+ if (vp->v_type != VREG) {
+ ZFS_EXIT(zfsvfs);
+ return (ENODEV);
+ }
+
+ /*
+ * If file is locked, disallow mapping.
+ */
+ if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
+ }
+
+ as_rangelock(as);
+ error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (error != 0) {
+ as_rangeunlock(as);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vn_a.vp = vp;
+ vn_a.offset = (u_offset_t)off;
+ vn_a.type = flags & MAP_TYPE;
+ vn_a.prot = prot;
+ vn_a.maxprot = maxprot;
+ vn_a.cred = cr;
+ vn_a.amp = NULL;
+ vn_a.flags = flags & ~MAP_TYPE;
+ vn_a.szc = 0;
+ vn_a.lgrp_mem_policy_flags = 0;
+
+ error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+ as_rangeunlock(as);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
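+
+/*
+ * For reference, a minimal userland sketch of what reaches this code
+ * (standard POSIX calls; the path and length are hypothetical):
+ *
+ *	int fd = open("/tank/some/file", O_RDWR);
+ *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ *	    MAP_SHARED, fd, 0);
+ *	p[0] = 'x';
+ *	(void) munmap(p, len);
+ *
+ * The store to p[0] faults pages in through zfs_getpage(); the
+ * munmap() tears the mapping down through zfs_delmap() below.
+ */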
+
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ uint64_t pages = btopr(len);
+
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
+ return (0);
+}
+
+/*
+ * The reason we push dirty pages as part of zfs_delmap() is so that we get a
+ * more accurate mtime for the associated file. Since we don't have a way of
+ * detecting when the data was actually modified, we have to resort to
+ * heuristics. If an explicit msync() is done, then we mark the mtime when the
+ * last page is pushed. The problem occurs when the msync() call is omitted,
+ * which is by far the most common case:
+ *
+ * open()
+ * mmap()
+ * <modify memory>
+ * munmap()
+ * close()
+ * <time lapse>
+ * putpage() via fsflush
+ *
+ * If we wait for fsflush to come along, we can have a modification time that
+ * is some arbitrary point in the future. In order to prevent this in the
+ * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
+ * torn down.
+ */
+/* ARGSUSED */
+static int
+zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ uint64_t pages = btopr(len);
+
+ ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
+
+ if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
+ vn_has_cached_data(vp))
+ (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
+
+ return (0);
+}
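+
+/*
+ * From the application's side (a sketch; the length is hypothetical):
+ * an explicit msync() still gives the most precise mtime, and without
+ * it the VOP_PUTPAGE() above bounds the staleness at munmap() time:
+ *
+ *	memcpy(p, buf, len);
+ *	(void) msync(p, len, MS_SYNC);
+ *	(void) munmap(p, len);
+ */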
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: vp - vnode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller [UNUSED].
+ * ct - caller context.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+/* ARGSUSED */
+static int
+zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ if (error = convoff(vp, bfp, 0, offset)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
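+
+/*
+ * Userland sketch of the one supported command (fd and the offset are
+ * hypothetical).  A zero l_len frees from l_start to end of file,
+ * i.e. truncates:
+ *
+ *	flock64_t fl = { 0 };
+ *	fl.l_whence = SEEK_SET;
+ *	fl.l_start = 4096;
+ *	fl.l_len = 0;
+ *	(void) fcntl(fd, F_FREESP, &fl);
+ */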
+#endif /* sun */
+
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
@@ -4075,13 +5008,21 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint32_t gen;
+ uint64_t gen64;
uint64_t object = zp->z_id;
zfid_short_t *zfid;
- int size, i;
+ int size, i, error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- gen = (uint32_t)zp->z_gen;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
fidp->fid_len = size;
@@ -4134,8 +5075,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
case _PC_FILESIZEBITS:
*valp = 64;
return (0);
-
-#if 0
+#ifdef sun
case _PC_XATTR_EXISTS:
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
@@ -4158,8 +5098,31 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
}
ZFS_EXIT(zfsvfs);
return (error);
-#endif
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_type == VREG || vp->v_type == VDIR);
+ return (0);
+
+ case _PC_ACCESS_FILTERING:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
+ vp->v_type == VDIR;
+ return (0);
+
+ case _PC_ACL_ENABLED:
+ *valp = _ACL_ACE_ENABLED;
+ return (0);
+#endif /* sun */
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+#ifdef sun
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ return (0);
+#endif /* sun */
case _PC_ACL_EXTENDED:
*valp = 0;
return (0);
@@ -4172,10 +5135,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
*valp = ACL_MAX_ENTRIES;
return (0);
- case _PC_MIN_HOLE_SIZE:
- *valp = (int)SPA_MINBLOCKSIZE;
- return (0);
-
default:
return (EOPNOTSUPP);
}
@@ -4208,14 +5167,350 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
+#ifdef sun
+/*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ * an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int max_blksz = zfsvfs->z_max_blksz;
+ uio_t *uio = &xuio->xu_uio;
+ ssize_t size = uio->uio_resid;
+ offset_t offset = uio->uio_loffset;
+ int blksz;
+ int fullblk, i;
+ arc_buf_t *abuf;
+ ssize_t maxsize;
+ int preamble, postamble;
+
+ if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ switch (ioflag) {
+ case UIO_WRITE:
+ /*
+ * Loan out an arc_buf for write if write size is bigger than
+ * max_blksz, and the file's block size is also max_blksz.
+ */
+ blksz = max_blksz;
+ if (size < blksz || zp->z_blksz != blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /*
+ * Caller requests buffers for write before knowing where the
+ * write offset might be (e.g. NFS TCP write).
+ */
+ if (offset == -1) {
+ preamble = 0;
+ } else {
+ preamble = P2PHASE(offset, blksz);
+ if (preamble) {
+ preamble = blksz - preamble;
+ size -= preamble;
+ }
+ }
+
+ postamble = P2PHASE(size, blksz);
+ size -= postamble;
+
+ fullblk = size / blksz;
+ (void) dmu_xuio_init(xuio,
+ (preamble != 0) + fullblk + (postamble != 0));
+ DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+ int, postamble, int,
+ (preamble != 0) + fullblk + (postamble != 0));
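+
+ /*
+ * Worked example with hypothetical numbers: for blksz = 128K,
+ * offset = 96K and size = 400K, the preamble is
+ * 128K - P2PHASE(96K, 128K) = 32K and size drops to 368K; the
+ * postamble is P2PHASE(368K, 128K) = 112K and size drops to
+ * 256K; fullblk = 2, so 1 + 2 + 1 = 4 arc_bufs are set up
+ * below.
+ */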
+
+ /*
+ * Have to fix iov base/len for partial buffers. They
+ * currently represent full arc_bufs.
+ */
+ if (preamble) {
+ /* data begins in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf,
+ blksz - preamble, preamble);
+ }
+
+ for (i = 0; i < fullblk; i++) {
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, blksz);
+ }
+
+ if (postamble) {
+ /* data ends in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, postamble);
+ }
+ break;
+ case UIO_READ:
+ /*
+ * Loan out an arc_buf for read if the read size is larger than
+ * the current file block size. Block alignment is not
+ * considered. Partial arc_buf will be loaned out for read.
+ */
+ blksz = zp->z_blksz;
+ if (blksz < zcr_blksz_min)
+ blksz = zcr_blksz_min;
+ if (blksz > zcr_blksz_max)
+ blksz = zcr_blksz_max;
+ /* avoid potential complexity of dealing with it */
+ if (blksz > max_blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ maxsize = zp->z_size - uio->uio_loffset;
+ if (size > maxsize)
+ size = maxsize;
+
+ if (size < blksz || vn_has_cached_data(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ break;
+ default:
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ uio->uio_extflg = UIO_XUIO;
+ XUIO_XUZC_RW(xuio) = ioflag;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+ int i;
+ arc_buf_t *abuf;
+ int ioflag = XUIO_XUZC_RW(xuio);
+
+ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+ i = dmu_xuio_cnt(xuio);
+ while (i-- > 0) {
+ abuf = dmu_xuio_arcbuf(xuio, i);
+ /*
+ * if abuf == NULL, it must be a write buffer
+ * that has been returned in zfs_write().
+ */
+ if (abuf)
+ dmu_return_arcbuf(abuf);
+ ASSERT(abuf || ioflag == UIO_WRITE);
+ }
+
+ dmu_xuio_fini(xuio);
+ return (0);
+}
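+
+/*
+ * Taken together (a sketch of the protocol, not a verbatim consumer):
+ * a zero-copy caller issues VOP_REQZCBUF() so zfs_reqzcbuf() can loan
+ * arc_bufs into the xuio, performs its read or write against those
+ * buffers, and then issues VOP_RETZCBUF() so zfs_retzcbuf() can return
+ * whatever zfs_write() has not already consumed.
+ */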
+
+/*
+ * Predeclare these here so that the compiler assumes that
+ * this is an "old style" function declaration that does
+ * not include arguments => we won't get type mismatch errors
+ * in the initializations that follow.
+ */
+static int zfs_inval();
+static int zfs_isdir();
+
+static int
+zfs_inval()
+{
+ return (EINVAL);
+}
+
+static int
+zfs_isdir()
+{
+ return (EISDIR);
+}
+/*
+ * Directory vnode operations template
+ */
+vnodeops_t *zfs_dvnodeops;
+const fs_operation_def_t zfs_dvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_READ, { .error = zfs_isdir },
+ VOPNAME_WRITE, { .error = zfs_isdir },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_CREATE, { .vop_create = zfs_create },
+ VOPNAME_REMOVE, { .vop_remove = zfs_remove },
+ VOPNAME_LINK, { .vop_link = zfs_link },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
+ VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
+ VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Regular file vnode operations template
+ */
+vnodeops_t *zfs_fvnodeops;
+const fs_operation_def_t zfs_fvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_READ, { .vop_read = zfs_read },
+ VOPNAME_WRITE, { .vop_write = zfs_write },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
+ VOPNAME_SPACE, { .vop_space = zfs_space },
+ VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
+ VOPNAME_MAP, { .vop_map = zfs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
+ VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
+ NULL, NULL
+};
+
+/*
+ * Symbolic link vnode operations template
+ */
+vnodeops_t *zfs_symvnodeops;
+const fs_operation_def_t zfs_symvnodeops_template[] = {
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * special share hidden files vnode operations template
+ */
+vnodeops_t *zfs_sharevnodeops;
+const fs_operation_def_t zfs_sharevnodeops_template[] = {
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Extended attribute directory vnode operations template
+ * This template is identical to the directory vnodes
+ * operation template except for restricted operations:
+ * VOP_MKDIR()
+ * VOP_SYMLINK()
+ * Note that there are other restrictions embedded in:
+ * zfs_create() - restrict type to VREG
+ * zfs_link() - no links into/out of attribute space
+ * zfs_rename() - no moves into/out of attribute space
+ */
+vnodeops_t *zfs_xdvnodeops;
+const fs_operation_def_t zfs_xdvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_CREATE, { .vop_create = zfs_create },
+ VOPNAME_REMOVE, { .vop_remove = zfs_remove },
+ VOPNAME_LINK, { .vop_link = zfs_link },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_MKDIR, { .error = zfs_inval },
+ VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
+ VOPNAME_SYMLINK, { .error = zfs_inval },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Error vnode operations template
+ */
+vnodeops_t *zfs_evnodeops;
+const fs_operation_def_t zfs_evnodeops_template[] = {
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ NULL, NULL
+};
+#endif /* sun */
+
static int
ioflags(int ioflags)
{
@@ -4286,14 +5581,12 @@ zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex);
VM_OBJECT_UNLOCK(object);
-
va = zfs_map_page(mreq, &sf);
error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex),
size, va, DMU_READ_PREFETCH);
if (size != PAGE_SIZE)
bzero(va + size, PAGE_SIZE - size);
zfs_unmap_page(sf);
-
VM_OBJECT_LOCK(object);
if (!error)
@@ -4336,7 +5629,7 @@ zfs_freebsd_open(ap)
error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
if (error == 0)
- vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
return (error);
}
@@ -4411,7 +5704,6 @@ zfs_freebsd_access(ap)
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
- znode_phys_t *zphys = zp->z_phys;
accmode_t accmode;
int error = 0;
@@ -4428,9 +5720,8 @@ zfs_freebsd_access(ap)
if (error == 0) {
accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0) {
- error = vaccess(vp->v_type, zphys->zp_mode,
- zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred,
- NULL);
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
}
}
@@ -4439,8 +5730,9 @@ zfs_freebsd_access(ap)
* non-directories.
*/
if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
- (zphys->zp_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
error = EACCES;
+ }
return (error);
}
@@ -4569,7 +5861,6 @@ zfs_freebsd_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
vattr_t *vap = ap->a_vap;
@@ -4616,7 +5907,6 @@ zfs_freebsd_setattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
@@ -4632,7 +5922,7 @@ zfs_freebsd_setattr(ap)
xva_init(&xvap);
xvap.xva_vattr = *vap;
- zflags = VTOZ(vp)->z_phys->zp_flags;
+ zflags = VTOZ(vp)->z_pflags;
if (vap->va_flags != VNOVAL) {
zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
@@ -4805,7 +6095,7 @@ zfs_reclaim_complete(void *arg, int pending)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf != NULL) {
+ if (zp->z_sa_hdl != NULL) {
ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
@@ -4830,8 +6120,9 @@ zfs_freebsd_reclaim(ap)
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ boolean_t rlocked;
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
ASSERT(zp != NULL);
@@ -4841,15 +6132,17 @@ zfs_freebsd_reclaim(ap)
vnode_destroy_vobject(vp);
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_phys != NULL);
zp->z_vnode = NULL;
mutex_exit(&zp->z_lock);
- if (zp->z_unlinked)
+ if (zp->z_unlinked) {
; /* Do nothing. */
- else if (zp->z_dbuf == NULL)
+ } else if (!rlocked) {
+ TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
+ taskqueue_enqueue(taskqueue_thread, &zp->z_task);
+ } else if (zp->z_sa_hdl == NULL) {
zfs_znode_free(zp);
- else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
+ } else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
int locked;
locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
@@ -4872,7 +6165,8 @@ zfs_freebsd_reclaim(ap)
vp->v_data = NULL;
ASSERT(vp->v_holdcnt >= 1);
VI_UNLOCK(vp);
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ if (rlocked)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index dbee467dbc20..1f3d0b2a31f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -47,6 +46,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */
@@ -56,9 +56,13 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#include <sys/refcount.h>
#include "zfs_prop.h"
+#include "zfs_comutil.h"
/* Used by fstat(1). */
SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
@@ -78,9 +82,6 @@ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
#define ZNODE_STAT_ADD(stat) /* nothing */
#endif /* ZNODE_STATS */
-#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
-#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
/*
* Functions needed for userland (ie: libzpool) are not put under
* #ifdef_KERNEL; the rest of the functions have dependencies
@@ -99,35 +100,11 @@ static kmem_cache_t *znode_cache = NULL;
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
-#if 1 /* XXXPJD: From OpenSolaris. */
/*
* We should never drop all dbuf refs without first clearing
* the eviction callback.
*/
panic("evicting znode %p\n", user_ptr);
-#else /* XXXPJD */
- znode_t *zp = user_ptr;
- vnode_t *vp;
-
- mutex_enter(&zp->z_lock);
- zp->z_dbuf = NULL;
- vp = ZTOV(zp);
- if (vp == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else if (vp->v_count == 0) {
- zp->z_vnode = NULL;
- vhold(vp);
- mutex_exit(&zp->z_lock);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
- vrecycle(vp, curthread);
- VOP_UNLOCK(vp, 0);
- vdrop(vp);
- zfs_znode_free(zp);
- } else {
- mutex_exit(&zp->z_lock);
- }
-#endif
}
extern struct vop_vector zfs_vnodeops;
@@ -140,6 +117,7 @@ extern struct vop_vector zfs_shareops;
* to pass vfsp here, which is not possible, because argument
* 'cdrarg' is defined at kmem_cache_create() time.
*/
+/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
@@ -160,6 +138,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_vnode = vp;
vp->v_data = (caddr_t)zp;
VN_LOCK_AREC(vp);
+ VN_LOCK_ASHARE(vp);
} else {
zp->z_vnode = NULL;
}
@@ -175,9 +154,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dbuf = NULL;
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
+ zp->z_moved = 0;
return (0);
}
@@ -198,7 +177,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -215,7 +193,7 @@ static struct {
} znode_move_stats;
#endif /* ZNODE_STATS */
-#if defined(sun)
+#ifdef sun
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
@@ -240,11 +218,17 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
nzp->z_blksz = ozp->z_blksz;
nzp->z_seq = ozp->z_seq;
nzp->z_mapcnt = ozp->z_mapcnt;
- nzp->z_last_itx = ozp->z_last_itx;
nzp->z_gen = ozp->z_gen;
nzp->z_sync_cnt = ozp->z_sync_cnt;
- nzp->z_phys = ozp->z_phys;
- nzp->z_dbuf = ozp->z_dbuf;
+ nzp->z_is_sa = ozp->z_is_sa;
+ nzp->z_sa_hdl = ozp->z_sa_hdl;
+ bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+ nzp->z_links = ozp->z_links;
+ nzp->z_size = ozp->z_size;
+ nzp->z_pflags = ozp->z_pflags;
+ nzp->z_uid = ozp->z_uid;
+ nzp->z_gid = ozp->z_gid;
+ nzp->z_mode = ozp->z_mode;
/*
* Since this is just an idle znode and kmem is already dealing with
@@ -255,9 +239,7 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
ozp->z_acl_cached = NULL;
}
- /* Update back pointers. */
- (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
- znode_evict_error);
+ sa_set_userp(nzp->z_sa_hdl, nzp);
/*
* Invalidate the original znode by clearing fields that provide a
@@ -265,8 +247,14 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
* ensure that zfs_znode_move() recognizes the znode as invalid in any
* subsequent callback.
*/
- ozp->z_dbuf = NULL;
+ ozp->z_sa_hdl = NULL;
POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+ /*
+ * Mark the znode.
+ */
+ nzp->z_moved = 1;
+ ozp->z_moved = (uint8_t)-1;
}
/*ARGSUSED*/
@@ -389,14 +377,19 @@ zfs_znode_init(void)
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-#if defined(sun)
kmem_cache_set_move(znode_cache, zfs_znode_move);
-#endif
}
void
zfs_znode_fini(void)
{
+#ifdef sun
+ /*
+ * Cleanup vfs & vnode ops
+ */
+ zfs_remove_op_tables();
+#endif /* sun */
+
/*
* Cleanup zcache
*/
@@ -406,6 +399,100 @@ zfs_znode_fini(void)
rw_destroy(&zfsvfs_lock);
}
+#ifdef sun
+struct vnodeops *zfs_dvnodeops;
+struct vnodeops *zfs_fvnodeops;
+struct vnodeops *zfs_symvnodeops;
+struct vnodeops *zfs_xdvnodeops;
+struct vnodeops *zfs_evnodeops;
+struct vnodeops *zfs_sharevnodeops;
+
+void
+zfs_remove_op_tables()
+{
+ /*
+ * Remove vfs ops
+ */
+ ASSERT(zfsfstype);
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ zfsfstype = 0;
+
+ /*
+ * Remove vnode ops
+ */
+ if (zfs_dvnodeops)
+ vn_freevnodeops(zfs_dvnodeops);
+ if (zfs_fvnodeops)
+ vn_freevnodeops(zfs_fvnodeops);
+ if (zfs_symvnodeops)
+ vn_freevnodeops(zfs_symvnodeops);
+ if (zfs_xdvnodeops)
+ vn_freevnodeops(zfs_xdvnodeops);
+ if (zfs_evnodeops)
+ vn_freevnodeops(zfs_evnodeops);
+ if (zfs_sharevnodeops)
+ vn_freevnodeops(zfs_sharevnodeops);
+
+ zfs_dvnodeops = NULL;
+ zfs_fvnodeops = NULL;
+ zfs_symvnodeops = NULL;
+ zfs_xdvnodeops = NULL;
+ zfs_evnodeops = NULL;
+ zfs_sharevnodeops = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+extern const fs_operation_def_t zfs_sharevnodeops_template[];
+
+int
+zfs_create_op_tables()
+{
+ int error;
+
+ /*
+ * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+ * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+ * In this case we just return as the ops vectors are already set up.
+ */
+ if (zfs_dvnodeops)
+ return (0);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+ &zfs_dvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+ &zfs_fvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+ &zfs_symvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+ &zfs_xdvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+ &zfs_evnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
+ &zfs_sharevnodeops);
+
+ return (error);
+}
+#endif /* sun */
+
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
@@ -424,9 +511,12 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
sharezp->z_vnode = &vnode;
vnode.v_data = sharezp;
@@ -436,8 +526,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
kcred, NULL, &acl_ids));
- zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
- &zp, 0, &acl_ids);
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, sharezp);
POINTER_INVALIDATE(&sharezp->z_zfsvfs);
error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
@@ -449,9 +538,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
ZTOV(sharezp)->v_count = 0;
ZTOV(sharezp)->v_holdcnt = 0;
zp->z_vnode = NULL;
+ sa_handle_destroy(sharezp->z_sa_hdl);
sharezp->z_vnode = NULL;
- dmu_buf_rele(sharezp->z_dbuf, NULL);
- sharezp->z_dbuf = NULL;
kmem_cache_free(znode_cache, sharezp);
return (error);
@@ -498,26 +586,25 @@ zfs_cmpldev(uint64_t dev)
}
static void
-zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
- znode_t *nzp;
-
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_dbuf == NULL);
+ ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
- zp->z_dbuf = db;
- nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
- /*
- * there should be no
- * concurrent zgets on this object.
- */
- if (nzp != NULL)
- panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
/*
* Slap on VROOT if we are the root znode
@@ -532,14 +619,12 @@ zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
void
zfs_znode_dmu_fini(znode_t *zp)
{
- dmu_buf_t *db = zp->z_dbuf;
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
zp->z_unlinked ||
RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
- ASSERT(zp->z_dbuf != NULL);
- zp->z_dbuf = NULL;
- VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
- dmu_buf_rele(db, NULL);
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
}
/*
@@ -550,47 +635,67 @@ zfs_znode_dmu_fini(znode_t *zp)
* return the znode
*/
static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
{
znode_t *zp;
vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
ASSERT(zp->z_dirlocks == NULL);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
/*
* Defer setting z_zfsvfs until the znode is ready to be a candidate for
* the zfs_znode_move() callback.
*/
- zp->z_phys = NULL;
+ zp->z_sa_hdl = NULL;
zp->z_unlinked = 0;
zp->z_atime_dirty = 0;
zp->z_mapcnt = 0;
- zp->z_last_itx = 0;
zp->z_id = db->db_object;
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
vp = ZTOV(zp);
-#ifdef TODO
- vn_reinit(vp);
-#endif
- zfs_znode_dmu_init(zfsvfs, zp, db);
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
- zp->z_gen = zp->z_phys->zp_gen;
+ zp->z_mode = mode;
-#if 0
- if (vp == NULL)
- return (zp);
-#endif
+ vp->v_type = IFTOVT((mode_t)mode);
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
switch (vp->v_type) {
case VDIR:
zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
@@ -598,8 +703,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
case VFIFO:
vp->v_op = &zfs_fifoops;
break;
- case VREG:
- if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
vp->v_op = &zfs_shareops;
}
break;
@@ -621,6 +727,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
return (zp);
}
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
/*
* Create a new DMU object to hold a zfs znode.
*
@@ -640,14 +749,23 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
*/
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
- dmu_buf_t *db;
- znode_phys_t *pzp;
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
timestruc_t now;
uint64_t gen, obj;
int err;
+ int bonuslen;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t sa_attrs[ZPL_END];
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
@@ -661,12 +779,16 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
gen = dmu_tx_get_txg(tx);
}
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+
/*
* Create a new DMU object.
*/
/*
* There's currently no mechanism for pre-reading the blocks that will
- * be to needed allocate a new object, so we accept the small chance
+ * be needed to allocate a new object, so we accept the small chance
* that there will be an i/o error and we will fail one of the
* assertions below.
*/
@@ -674,103 +796,202 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (zfsvfs->z_replay) {
err = zap_create_claim_norm(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = zap_create_norm(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
} else {
if (zfsvfs->z_replay) {
err = dmu_object_claim(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
}
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
- dmu_buf_will_dirty(db, tx);
-
- /*
- * Initialize the znode physical data to zero.
- */
- ASSERT(db->db_size >= sizeof (znode_phys_t));
- bzero(db->db_data, db->db_size);
- pzp = db->db_data;
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
* to reference the just-allocated physical data area.
*/
if (flag & IS_ROOT_NODE) {
- dzp->z_dbuf = db;
- dzp->z_phys = pzp;
dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
}
/*
* If parent is an xattr, so am I.
*/
- if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ if (dzp_pflags & ZFS_XATTR) {
flag |= IS_XATTR;
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- pzp->zp_rdev = zfs_expldev(vap->va_rdev);
}
if (zfsvfs->z_use_fuids)
- pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
if (vap->va_type == VDIR) {
- pzp->zp_size = 2; /* contents ("." and "..") */
- pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
}
- pzp->zp_parent = dzp->z_id;
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
if (flag & IS_XATTR)
- pzp->zp_flags |= ZFS_XATTR;
+ pflags |= ZFS_XATTR;
- pzp->zp_gen = gen;
+ /*
+ * No execs denied will be determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
- ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
- ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&now, atime);
}
if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&now, mtime);
}
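+ /*
+ * (As used throughout this file, ZFS_TIME_ENCODE packs a
+ * timestruc_t into two uint64s: seconds in [0], nanoseconds in
+ * [1].  E.g. tv_sec = 1288000000, tv_nsec = 500 encodes as
+ * time[0] = 1288000000, time[1] = 500.)
+ */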
- pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be
+ * constructed in the old znode_phys_t format.  Don't change this
+ * ordering.
+ */
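+ /*
+ * (Matching the bulk adds below, the legacy znode_phys_t order
+ * is: atime, mtime, ctime, crtime, gen, mode, size, parent,
+ * links, xattr, rdev, flags, uid, gid, four pad words, then the
+ * embedded ACL.)
+ */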
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
if (!(flag & IS_ROOT_NODE)) {
- *zpp = zfs_znode_alloc(zfsvfs, db, 0);
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
} else {
/*
* If we are creating the root node, the "parent" we
* passed in is the znode for the root.
*/
*zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
}
- pzp->zp_uid = acl_ids->z_fuid;
- pzp->zp_gid = acl_ids->z_fgid;
- pzp->zp_mode = acl_ids->z_mode;
- VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+
if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(*zpp, (xvattr_t *)vap);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
+ ASSERT3P(err, ==, 0);
+ }
if (!(flag & IS_ROOT_NODE)) {
vnode_t *vp;
@@ -780,10 +1001,16 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
vp->v_vflag &= ~VV_FORCEINSMQ;
KASSERT(err == 0, ("insmntque() failed: error %d", err));
}
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
+/*
+ * zfs_xvattr_set only updates the in-core attributes;
+ * it is assumed the caller will be doing an sa_bulk_update
+ * to push the changes out.
+ */
void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
xoptattr_t *xoap;
@@ -791,60 +1018,86 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
ASSERT(xoap);
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
- xoap->xoa_av_quarantined);
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
- sizeof (xoap->xoa_av_scanstamp));
- zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+ zfs_sa_set_scanstamp(zp, xvap, tx);
XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
}
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
int
@@ -853,41 +1106,50 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
dmu_object_info_t doi;
dmu_buf_t *db;
znode_t *zp;
- vnode_t *vp;
- int err, first = 1;
+ int err;
+ sa_handle_t *hdl;
+ int first = 1;
*zpp = NULL;
+
again:
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- zp = dmu_buf_get_user(db);
- if (zp != NULL) {
- mutex_enter(&zp->z_lock);
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
/*
- * Since we do immediate eviction of the z_dbuf, we
- * should never find a dbuf with a znode that doesn't
- * know about the dbuf.
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
*/
- ASSERT3P(zp->z_dbuf, ==, db);
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_unlinked) {
err = ENOENT;
} else {
+ vnode_t *vp;
int dying = 0;
vp = ZTOV(zp);
@@ -915,7 +1177,7 @@ again:
* znode is dying so we can't reuse it, we must
* wait until destruction is completed.
*/
- dmu_buf_rele(db, NULL);
+ sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
if (vp != NULL)
@@ -926,7 +1188,7 @@ again:
*zpp = zp;
err = 0;
}
- dmu_buf_rele(db, NULL);
+ sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -938,24 +1200,30 @@ again:
*
* There is a small window where zfs_vget() could
* find this object while a file create is still in
- * progress. Since a gen number can never be zero
- * we will check that to determine if its an allocated
- * file.
+ * progress.  This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails, it will drop the hold on the
+ * bonus buffer.
*/
-
- if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = ENOENT;
+ } else {
*zpp = zp;
- vp = ZTOV(zp);
- vp->v_vflag |= VV_FORCEINSMQ;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
err = insmntque(vp, zfsvfs->z_vfs);
- vp->v_vflag &= ~VV_FORCEINSMQ;
- KASSERT(err == 0, ("insmntque() failed: error %d", err));
- VOP_UNLOCK(vp, 0);
- err = 0;
- } else {
- dmu_buf_rele(db, NULL);
- err = ENOENT;
+ if (err == 0)
+ VOP_UNLOCK(vp, 0);
+ else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
}
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -968,40 +1236,91 @@ zfs_rezget(znode_t *zp)
dmu_object_info_t doi;
dmu_buf_t *db;
uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
int err;
+ int count = 0;
+ uint64_t gen;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
- dmu_buf_rele(db, NULL);
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
}
- mutex_enter(&zp->z_acl_lock);
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
+ }
+
+ /*
+ * XXXPJD: Not sure how that is possible, but under heavy
+ * zfs recv -F load it happens that z_gen is the same, but
+ * the vnode type differs from the znode type.  This would mean
+ * that, for example, a regular file was replaced with a directory
+ * that has the same object number.
+ */
+ if (ZTOV(zp) != NULL &&
+ ZTOV(zp)->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
}
- mutex_exit(&zp->z_acl_lock);
- zfs_znode_dmu_init(zfsvfs, zp, db);
- zp->z_unlinked = (zp->z_phys->zp_links == 0);
+ zp->z_unlinked = (zp->z_links == 0);
zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size && ZTOV(zp) != NULL)
+ vnode_pager_setsize(ZTOV(zp), zp->z_size);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1014,11 +1333,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
uint64_t obj = zp->z_id;
- uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ uint64_t acl_obj = zfs_external_acl(zp);
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- if (acl_obj)
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
VERIFY(0 == dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1033,7 +1354,7 @@ zfs_zinactive(znode_t *zp)
uint64_t z_id = zp->z_id;
int vfslocked;
- ASSERT(zp->z_dbuf && zp->z_phys);
+ ASSERT(zp->z_sa_hdl);
/*
* Don't allow a zfs_zget() while were trying to release this znode
@@ -1069,6 +1390,7 @@ zfs_zinactive(znode_t *zp)
VFS_UNLOCK_GIANT(vfslocked);
return;
}
+
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
}
@@ -1079,6 +1401,7 @@ zfs_znode_free(znode_t *zp)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(ZTOV(zp) == NULL);
+ ASSERT(zp->z_sa_hdl == NULL);
mutex_enter(&zfsvfs->z_znodes_lock);
POINTER_INVALIDATE(&zp->z_zfsvfs);
list_remove(&zfsvfs->z_all_znodes, zp);
@@ -1095,59 +1418,40 @@ zfs_znode_free(znode_t *zp)
}
void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
{
timestruc_t now;
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
gethrestime(&now);
- if (tx) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
zp->z_atime_dirty = 0;
zp->z_seq++;
} else {
zp->z_atime_dirty = 1;
}
- if (flag & AT_ATIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
if (flag & AT_MTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
- if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
}
if (flag & AT_CTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= ZFS_ARCHIVE;
+ zp->z_pflags |= ZFS_ARCHIVE;
}
}
/*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- * 1 - Only the ACCESS time is ever updated outside of a transaction.
- * 2 - Multiple consecutive updates will be collapsed into a single
- * znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- mutex_enter(&zp->z_lock);
- zfs_time_stamper_locked(zp, flag, tx);
- mutex_exit(&zp->z_lock);
-}
-
-/*
* Grow the block size for a file.
*
* IN: zp - znode of file to free data in.
@@ -1169,19 +1473,36 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
* we will not grow. If there is more than one block in a file,
* the blocksize cannot change.
*/
- if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
return;
error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
size, 0, tx);
+
if (error == ENOTSUP)
return;
ASSERT3U(error, ==, 0);
/* What blocksize did we actually get? */
- dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}
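+
+/*
+ * Example of the rule above (hypothetical sizes): a 3K file with an 8K
+ * block may grow its blocksize when extended past 8K, since it still
+ * occupies a single block; a 200K file with 128K blocks already spans
+ * more than one block, so its blocksize can no longer change.
+ */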
+#ifdef sun
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+#endif /* sun */
+
/*
* Increase the file length
*
@@ -1208,13 +1529,14 @@ zfs_extend(znode_t *zp, uint64_t end)
/*
* Nothing to do if file already at desired length.
*/
- if (end <= zp->z_phys->zp_size) {
+ if (end <= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
if (end > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
/*
@@ -1242,19 +1564,21 @@ top:
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
if (newblksz)
zfs_grow_blocksize(zp, newblksz, tx);
- zp->z_phys->zp_size = end;
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
zfs_range_unlock(rl);
dmu_tx_commit(tx);
- vnode_pager_setsize(ZTOV(zp), end);
-
return (0);
}
@@ -1283,20 +1607,21 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
/*
* Nothing to do if file already at desired length.
*/
- if (off >= zp->z_phys->zp_size) {
+ if (off >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
- if (off + len > zp->z_phys->zp_size)
- len = zp->z_phys->zp_size - off;
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
if (error == 0) {
/*
* In FreeBSD we cannot free block in the middle of a file,
- * but only at the end of a file.
+ * but only at the end of a file, so this code path should
+ * never happen.
*/
vnode_pager_setsize(ZTOV(zp), off);
}
@@ -1323,6 +1648,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
dmu_tx_t *tx;
rl_t *rl;
int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
/*
* We will change zp_size, lock the whole file.
@@ -1332,7 +1659,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
/*
* Nothing to do if file already at desired length.
*/
- if (end >= zp->z_phys->zp_size) {
+ if (end >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
@@ -1344,7 +1671,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1356,9 +1684,17 @@ top:
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_size = end;
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
dmu_tx_commit(tx);
@@ -1394,9 +1730,17 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
int error;
- if (off > zp->z_phys->zp_size) {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
error = zfs_extend(zp, off+len);
if (error == 0 && log)
goto log;
@@ -1404,18 +1748,29 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
return (error);
}
+ /*
+ * Check for any locks in the region to be freed.
+ */
+
+ if (MANDLOCK(vp, (mode_t)mode)) {
+ uint64_t length = (len ? len : zp->z_size - off);
+ if (error = chklock(vp, FWRITE, off, length, flag, NULL))
+ return (error);
+ }
+
if (len == 0) {
error = zfs_trunc(zp, off);
} else {
if ((error = zfs_free_range(zp, off, len)) == 0 &&
- off + len > zp->z_phys->zp_size)
+ off + len > zp->z_size)
error = zfs_extend(zp, off+len);
}
if (error || !log)
return (error);
log:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1427,7 +1782,14 @@ log:
return (error);
}
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
dmu_tx_commit(tx);
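
The hunks above share one SA migration pattern: rather than dirtying the bonus buffer and writing znode_phys_t fields in place, callers now gather attribute updates into a sa_bulk_attr_t array and push them with a single sa_bulk_update() call. A minimal sketch of the pattern, assuming zp, zfsvfs and tx are set up as in zfs_freesp() above:

    sa_bulk_attr_t bulk[4];
    uint64_t mtime[2], ctime[2];
    int count = 0, error;

    /* Stage the attributes to be written; nothing hits disk yet. */
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
        &zp->z_size, sizeof (zp->z_size));
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
        &zp->z_pflags, 8);

    /* Fill in mtime/ctime and flags, then commit them in one update. */
    zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
    ASSERT(error == 0);
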
@@ -1438,7 +1800,7 @@ void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
zfsvfs_t zfsvfs;
- uint64_t moid, obj, version;
+ uint64_t moid, obj, sa_obj, version;
uint64_t sense = ZFS_CASE_SENSITIVE;
uint64_t norm = 0;
nvpair_t *elem;
@@ -1465,12 +1827,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
/*
* Set starting attributes.
*/
- if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
- version = ZPL_VERSION;
- else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
- version = ZPL_VERSION_USERSPACE - 1;
- else
- version = ZPL_VERSION_FUID - 1;
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
elem = NULL;
while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
/* For the moment we expect all zpl props to be uint64_ts */
@@ -1496,6 +1853,18 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
/*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
* Create a delete queue.
*/
obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
@@ -1514,22 +1883,32 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
vattr.va_uid = crgetuid(cr);
vattr.va_gid = crgetgid(cr);
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(rootzp, NULL, 0);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
vnode.v_type = VDIR;
vnode.v_data = rootzp;
rootzp->z_vnode = &vnode;
- bzero(&zfsvfs, sizeof (zfsvfs_t));
-
zfsvfs.z_os = os;
zfsvfs.z_parent = &zfsvfs;
zfsvfs.z_version = version;
zfsvfs.z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs.z_use_sa = USE_SA(version, os);
zfsvfs.z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs.z_attr_table);
+
+ ASSERT(error == 0);
+
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
@@ -1544,19 +1923,17 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_zfsvfs = &zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
- zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
ASSERT(error == 0);
zfs_acl_ids_free(&acl_ids);
POINTER_INVALIDATE(&rootzp->z_zfsvfs);
- dmu_buf_rele(rootzp->z_dbuf, NULL);
- rootzp->z_dbuf = NULL;
+ sa_handle_destroy(rootzp->z_sa_hdl);
rootzp->z_vnode = NULL;
kmem_cache_free(znode_cache, rootzp);
@@ -1573,44 +1950,122 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
#endif /* _KERNEL */
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
+
static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
{
- dmu_buf_t *db;
dmu_object_info_t doi;
- znode_phys_t *zp;
int error;
- if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
return (error);
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ doi.doi_bonus_type == DMU_OT_ZNODE &&
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, FTAG);
- return (EINVAL);
+ sa_buf_rele(*db, tag);
+ return (ENOTSUP);
}
- zp = db->db_data;
- *pobjp = zp->zp_parent;
- *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
- S_ISDIR(zp->zp_mode);
- dmu_buf_rele(db, FTAG);
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
return (0);
}
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp,
+ int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ *pobjp = parent;
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
char *path = buf + len - 1;
int error;
*path = '\0';
+ sa_hdl = hdl;
for (;;) {
uint64_t pobj;
@@ -1618,7 +2073,10 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
size_t complen;
int is_xattrdir;
- if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+ if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
&is_xattrdir)) != 0)
break;
@@ -1643,9 +2101,80 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
ASSERT(path >= buf);
bcopy(component, path, complen);
obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
}
if (error == 0)
(void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
return (error);
}
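
Both entry points follow the same shape: zfs_sa_setup() resolves the attribute table once, zfs_grab_sa_handle() pins the starting object, and the _impl worker does the walk. A hedged usage sketch (the MAXPATHLEN-sized buffer is an assumption; callers may size it differently):

    /* Sketch: resolve the path of object "obj" in objset "osp". */
    char path[MAXPATHLEN];
    zfs_stat_t sb;
    int error;

    error = zfs_obj_to_path(osp, obj, path, sizeof (path));
    if (error == 0)
        (void) printf("obj %llu: %s\n", (u_longlong_t)obj, path);

    /* Or fetch mode/gen/links/ctime along with the path. */
    error = zfs_obj_to_stats(osp, obj, &sb, path, sizeof (path));
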
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 490e50fbb94c..5c7b22e2a3f3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -19,13 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/zfs_context.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -34,8 +34,9 @@
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@@ -66,11 +67,11 @@
/*
* This global ZIL switch affects all pools
*/
-int zil_disable = 0; /* disable intent logging */
+int zil_replay_disable = 0; /* disable intent logging replay */
SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RW, &zil_disable, 0,
- "Disable ZFS Intent Log (ZIL)");
+TUNABLE_INT("vfs.zfs.zil_replay_disable", &zil_replay_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RW,
+ &zil_replay_disable, 0, "Disable intent logging replay");
/*
* Tunable parameter for debugging or performance analysis. Setting
@@ -84,11 +85,26 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
static kmem_cache_t *zil_lwb_cache;
+static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
+
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
static int
-zil_dva_compare(const void *x1, const void *x2)
+zil_bp_compare(const void *x1, const void *x2)
{
- const dva_t *dva1 = x1;
- const dva_t *dva2 = x2;
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
return (-1);
@@ -104,34 +120,37 @@ zil_dva_compare(const void *x1, const void *x2)
}
static void
-zil_dva_tree_init(avl_tree_t *t)
+zil_bp_tree_init(zilog_t *zilog)
{
- avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
- offsetof(zil_dva_node_t, zn_node));
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}
static void
-zil_dva_tree_fini(avl_tree_t *t)
+zil_bp_tree_fini(zilog_t *zilog)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
void *cookie = NULL;
while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zn, sizeof (zil_dva_node_t));
+ kmem_free(zn, sizeof (zil_bp_node_t));
avl_destroy(t);
}
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva = BP_IDENTITY(bp);
+ zil_bp_node_t *zn;
avl_index_t where;
if (avl_find(t, dva, &where) != NULL)
return (EEXIST);
- zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
zn->zn_dva = *dva;
avl_insert(t, zn, where);
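
The renamed tree is still keyed by DVA, but the node type is now zil_bp_node_t and the tree lives in the zilog itself, so the claim and free passes can detect blocks they have already visited. The typical check, as zil_claim_log_block() below uses it:

    /* Skip blocks already committed, or already seen this pass. */
    if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
        return (0);
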
@@ -156,35 +175,31 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
}
/*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
+ * Read a log block and make sure it's valid.
*/
static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+ char **end)
{
- blkptr_t blk = *bp;
- zbookmark_t zb;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_t zb;
int error;
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
- *abufpp = NULL;
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
- /*
- * We shouldn't be doing any scrubbing while we're doing log
- * replay, it's OK to not lock.
- */
- error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
- arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
- char *data = (*abufpp)->b_data;
- uint64_t blksz = BP_GET_LSIZE(bp);
- zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
zio_cksum_t cksum = bp->blk_cksum;
/*
@@ -197,43 +212,102 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
- if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
- (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
- error = ECKSUM;
- }
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
- if (error) {
- VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
- *abufpp = NULL;
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
}
+
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ }
return (error);
}
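
zil_read_log_data() doubles as a pure readability probe: passing a NULL wbuf skips the copy-out but still performs the ARC read, and holes succeed trivially. The claim path below relies on exactly that:

    /* Verify the record's data block is readable before claiming it. */
    if (lr->lr_blkptr.blk_birth >= first_txg &&
        (error = zil_read_log_data(zilog, lr, NULL)) != 0)
        return (error);
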
/*
* Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
*/
-uint64_t
+int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
const zil_header_t *zh = zilog->zl_header;
- uint64_t claim_seq = zh->zh_claim_seq;
- uint64_t seq = 0;
- uint64_t max_seq = 0;
- blkptr_t blk = zh->zh_log;
- arc_buf_t *abuf;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
char *lrbuf, *lrp;
- zil_trailer_t *ztp;
- int reclen, error;
+ int error = 0;
- if (BP_IS_HOLE(&blk))
- return (max_seq);
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
/*
* Starting at the block pointed to by zh_log we read the log chain.
@@ -244,105 +318,156 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- zil_dva_tree_init(&zilog->zl_dva_tree);
- for (;;) {
- seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
- if (claim_seq != 0 && seq > claim_seq)
- break;
-
- ASSERT(max_seq < seq);
- max_seq = seq;
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
+ char *end;
- error = zil_read_log_block(zilog, &blk, &abuf);
+ if (blk_seq > claim_blk_seq)
+ break;
+ if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+ break;
+ ASSERT3U(max_blk_seq, <, blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
- if (parse_blk_func != NULL)
- parse_blk_func(zilog, &blk, arg, txg);
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+ break;
+ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
if (error)
break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
-
- if (parse_lr_func == NULL) {
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- continue;
- }
-
- for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- parse_lr_func(zilog, lr, arg, txg);
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+ if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+ goto done;
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
}
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- zil_dva_tree_fini(&zilog->zl_dva_tree);
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
- return (max_seq);
+ return (error);
}
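
zil_parse() now reports failure directly and leaves the walk statistics in the zilog; callers that previously consumed the returned sequence number read zl_parse_blk_seq and zl_parse_lr_seq afterwards instead. A sketch of the new calling convention, mirroring the reworked zil_claim() below:

    (void) zil_parse(zilog, zil_claim_log_block,
        zil_claim_log_record, tx, first_txg);
    zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
    zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
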
-/* ARGSUSED */
-static void
+static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
- spa_t *spa = zilog->zl_spa;
- int err;
-
/*
* Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
*/
- if (bp->blk_birth >= first_txg &&
- zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
- err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
- ASSERT(err == 0);
- }
+ if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}
-static void
+static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
- }
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg &&
+ (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+ return (error);
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
/* ARGSUSED */
-static void
+static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
- zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+ zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
}
-static void
+static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
/*
* If we previously claimed it, we need to free it.
*/
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- if (bp->blk_birth >= claim_txg &&
- !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
- (void) arc_free(NULL, zilog->zl_spa,
- dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
- }
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+ lwb->lwb_tx = NULL;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ return (lwb);
}
/*
* Create an on-disk intent log.
*/
-static void
+static lwb_t *
zil_create(zilog_t *zilog)
{
const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
+ lwb_t *lwb = NULL;
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
blkptr_t blk;
@@ -359,22 +484,23 @@ zil_create(zilog_t *zilog)
blk = zh->zh_log;
/*
- * If we don't already have an initial log block or we have one
- * but it's the wrong endianness then allocate one.
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block is the wrong endianness
*/
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
- zio_free_blk(zilog->zl_spa, &blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
- NULL, txg);
+ error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+ ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -383,20 +509,8 @@ zil_create(zilog_t *zilog)
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
- if (error == 0) {
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = blk;
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
- lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
- lwb->lwb_max_txg = txg;
- lwb->lwb_zio = NULL;
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
- }
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -409,6 +523,8 @@ zil_create(zilog_t *zilog)
}
ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+ return (lwb);
}
/*
@@ -433,26 +549,18 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_old_header = *zh; /* debugging aid */
+
if (BP_IS_HOLE(&zh->zh_log))
return;
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
mutex_enter(&zilog->zl_lock);
- /*
- * It is possible for the ZIL to get the previously mounted zilog
- * structure of the same dataset if quickly remounted and the dbuf
- * eviction has not completed. In this case we can see a non
- * empty lwb list and keep_first will be set. We fix this by
- * clearing the keep_first. This will be slower but it's very rare.
- */
- if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
- keep_first = B_FALSE;
-
ASSERT3U(zilog->zl_destroy_txg, <, txg);
zilog->zl_destroy_txg = txg;
zilog->zl_keep_first = keep_first;
@@ -464,41 +572,20 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
}
- } else {
- if (!keep_first) {
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zh->zh_claim_txg);
- }
+ } else if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
}
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
}
-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (B_TRUE);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (B_TRUE);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (B_FALSE);
-}
-
int
-zil_claim(char *osname, void *txarg)
+zil_claim(const char *osname, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -507,7 +594,7 @@ zil_claim(char *osname, void *txarg)
objset_t *os;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
return (0);
@@ -516,28 +603,13 @@ zil_claim(char *osname, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
- if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
if (!BP_IS_HOLE(&zh->zh_log))
- zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+ zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- /*
- * Record here whether the zil has any records to replay.
- * If the header block pointer is null or the block points
- * to the stubby then we know there are no valid log records.
- * We use the header to store this state as the the zilog gets
- * freed later in dmu_objset_close().
- * The flags (and the rest of the header fields) are cleared in
- * zil_sync() as a result of a zil_destroy(), after replaying the log.
- *
- * Note, the intent log can be empty but still need the
- * stubby to be claimed.
- */
- if (!zil_empty(zilog)) {
- zh->zh_flags |= ZIL_REPLAY_NEEDED;
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ dmu_objset_rele(os, FTAG);
+ return (0);
}
/*
@@ -549,14 +621,19 @@ zil_claim(char *osname, void *txarg)
*/
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
- zh->zh_claim_txg = first_txg;
- zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ (void) zil_parse(zilog, zil_claim_log_block,
zil_claim_log_record, tx, first_txg);
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
}
ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (0);
}
@@ -565,53 +642,67 @@ zil_claim(char *osname, void *txarg)
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
-/* ARGSUSED */
int
-zil_check_log_chain(char *osname, void *txarg)
+zil_check_log_chain(const char *osname, void *tx)
{
zilog_t *zilog;
- zil_header_t *zh;
- blkptr_t blk;
- arc_buf_t *abuf;
objset_t *os;
- char *lrbuf;
- zil_trailer_t *ztp;
+ blkptr_t *bp;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ ASSERT(tx == NULL);
+
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
return (0);
}
zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
- blk = zh->zh_log;
- if (BP_IS_HOLE(&blk)) {
- dmu_objset_close(os);
- return (0); /* no chain */
- }
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
- for (;;) {
- error = zil_read_log_block(zilog, &blk, &abuf);
- if (error)
- break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the log
+ * as its content should have already been synced to the pool.
+ */
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
}
- dmu_objset_close(os);
- if (error == ECKSUM)
- return (0); /* normal end of chain */
- return (error);
+
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+
+ dmu_objset_rele(os, FTAG);
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
static int
zil_vdev_compare(const void *x1, const void *x2)
{
- uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
- uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
if (v1 < v2)
return (-1);
@@ -622,7 +713,7 @@ zil_vdev_compare(const void *x1, const void *x2)
}
void
-zil_add_block(zilog_t *zilog, blkptr_t *bp)
+zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_vdev_tree;
avl_index_t where;
@@ -652,7 +743,7 @@ zil_add_block(zilog_t *zilog, blkptr_t *bp)
mutex_exit(&zilog->zl_vdev_lock);
}
-void
+static void
zil_flush_vdevs(zilog_t *zilog)
{
spa_t *spa = zilog->zl_spa;
@@ -698,9 +789,9 @@ zil_lwb_write_done(zio_t *zio)
{
lwb_t *lwb = zio->io_private;
zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
@@ -719,17 +810,15 @@ zil_lwb_write_done(zio_t *zio)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
- if (zio->io_error)
- zilog->zl_log_error = B_TRUE;
+ lwb->lwb_tx = NULL;
+ mutex_exit(&zilog->zl_lock);
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync. We still have the
- * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+ * which we allocated the next block sync.
*/
- txg_rele_to_sync(&lwb->lwb_txgh);
- mutex_exit(&zilog->zl_lock);
+ dmu_tx_commit(tx);
}
/*
@@ -740,10 +829,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
zbookmark_t zb;
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
if (zilog->zl_root_zio == NULL) {
zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
@@ -751,118 +839,147 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
}
if (lwb->lwb_zio == NULL) {
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+ 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
/*
+ * Define a limited set of intent log block sizes.
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+uint64_t zil_block_buckets[] = {
+ 4096, /* non TX_WRITE */
+ 8192+4096, /* data base */
+ 32*1024 + 4096, /* NFS writes */
+ UINT64_MAX
+};
+
+/*
+ * Use the slog as long as the logbias is 'latency' and the current commit size
+ * is less than the limit or the total list size is less than 2X the limit.
+ * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
+ */
+uint64_t zil_slog_limit = 1024 * 1024;
+#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
+ (((zilog)->zl_cur_used < zil_slog_limit) || \
+ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
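
Worked through with the default limit: a latency-biased zilog with a 256 KB commit (zl_cur_used) qualifies for the slog even with 3 MB of itxs queued, since 256 KB < 1 MB; push zl_cur_used to 1.5 MB with the same 3 MB queue and both tests fail (1.5 MB >= 1 MB, 3 MB >= 2 MB), so the write goes to the main pool instead.
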
+
+/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *nlwb;
- zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
spa_t *spa = zilog->zl_spa;
- blkptr_t *bp = &ztp->zit_next_blk;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
uint64_t txg;
- uint64_t zil_blksz;
- int error;
+ uint64_t zil_blksz, wsz;
+ int i, error;
- ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
*/
- txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
- txg_rele_to_quiesce(&lwb->lwb_txgh);
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
/*
- * Pick a ZIL blocksize. We request a size that is the
- * maximum of the previous used size, the current used size and
- * the amount waiting in the queue.
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
*/
- zil_blksz = MAX(zilog->zl_prev_used,
- zilog->zl_cur_used + sizeof (*ztp));
- zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
- zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
- if (zil_blksz > ZIL_MAX_BLKSZ)
- zil_blksz = ZIL_MAX_BLKSZ;
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+ continue;
+ zil_blksz = zil_block_buckets[i];
+ if (zil_blksz == UINT64_MAX)
+ zil_blksz = SPA_MAXBLOCKSIZE;
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
- if (error) {
- dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
- /*
- * We dirty the dataset to ensure that zil_sync() will
- * be called to remove this lwb from our zl_lwb_list.
- * Failing to do so, may leave an lwb with a NULL lwb_buf
- * hanging around on the zl_lwb_list.
- */
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- dmu_tx_commit(tx);
+ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
+ USE_SLOG(zilog));
+ if (!error) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
/*
- * Since we've just experienced an allocation failure so we
- * terminate the current lwb and send it on its way.
+ * Allocate a new log write buffer (lwb).
*/
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- zio_nowait(lwb->lwb_zio);
+ nlwb = zil_alloc_lwb(zilog, bp, txg);
- /*
- * By returning NULL the caller will call tx_wait_synced()
- */
- return (NULL);
+ /* Record the block for later vdev flushing */
+ zil_add_block(zilog, &lwb->lwb_blk);
}
- ASSERT3U(bp->blk_birth, ==, txg);
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_zio, wsz);
- /*
- * Allocate a new log write buffer (lwb).
- */
- nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ } else {
+ wsz = lwb->lwb_sz;
+ }
- nlwb->lwb_zilog = zilog;
- nlwb->lwb_blk = *bp;
- nlwb->lwb_nused = 0;
- nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
- nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
- nlwb->lwb_max_txg = txg;
- nlwb->lwb_zio = NULL;
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
/*
- * Put new lwb at the end of the log chain
+ * clear unused data for security
*/
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, nlwb);
- mutex_exit(&zilog->zl_lock);
+ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
- /* Record the block for later vdev flushing */
- zil_add_block(zilog, &lwb->lwb_blk);
+ zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
/*
- * kick off the write for the old log block
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
*/
- dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
- ASSERT(lwb->lwb_zio);
- zio_nowait(lwb->lwb_zio);
-
return (nlwb);
}
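
The size heuristic above replaces the old max-of-estimates logic: round the needed payload up to the smallest bucket (bucketed sizes allocate from the same metaslab, so they are cheap to write), then take the max over the last ZIL_PREV_BLKS picks so a 2k/64k/2k/64k request stream does not whipsaw the block size. Pulled out as a standalone sketch (needed stands in for zl_cur_used):

    uint64_t zil_blksz = needed + sizeof (zil_chain_t);
    int i;

    /* Smallest bucket that fits; the UINT64_MAX sentinel ends the scan. */
    for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
        continue;
    zil_blksz = zil_block_buckets[i];
    if (zil_blksz == UINT64_MAX)
        zil_blksz = SPA_MAXBLOCKSIZE;

    /* Damp oscillation with the history of recent sizes. */
    zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
    for (i = 0; i < ZIL_PREV_BLKS; i++)
        zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
    zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
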
@@ -870,20 +987,20 @@ static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr = (lr_write_t *)lrc;
+ lr_write_t *lrw = (lr_write_t *)lrc;
+ char *lr_buf;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen;
+ uint64_t dlen = 0;
if (lwb == NULL)
return (NULL);
+
ASSERT(lwb->lwb_buf != NULL);
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
dlen = P2ROUNDUP_TYPED(
- lr->lr_length, sizeof (uint64_t), uint64_t);
- else
- dlen = 0;
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
zilog->zl_cur_used += (reclen + dlen);
@@ -892,24 +1009,22 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
/*
* If this record won't fit in the current log block, start a new one.
*/
- if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_init(zilog, lwb);
- ASSERT(lwb->lwb_nused == 0);
- if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ ASSERT(LWB_EMPTY(lwb));
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
}
}
- /*
- * Update the lrc_seq, to be log record sequence number. See zil.h
- * Then copy the record to the log buffer.
- */
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrc = (lr_t *)lr_buf;
+ lrw = (lr_write_t *)lrc;
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
@@ -921,18 +1036,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
char *dbuf;
int error;
- /* alignment is guaranteed */
- lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
if (dlen) {
ASSERT(itx->itx_wr_state == WR_NEED_COPY);
- dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
- lr->lr_common.lrc_reclen += dlen;
+ dbuf = lr_buf + reclen;
+ lrw->lr_common.lrc_reclen += dlen;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
error = zilog->zl_get_data(
- itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ itx->itx_private, lrw, dbuf, lwb->lwb_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
@@ -945,9 +1058,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
}
}
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
lwb->lwb_nused += reclen + dlen;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
- ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
return (lwb);
@@ -965,247 +1085,446 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
itx->itx_lr.lrc_reclen = lrsize;
itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
return (itx);
}
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+void
+zil_itx_destroy(itx_t *itx)
{
- uint64_t seq;
+ kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
- ASSERT(itx->itx_lr.lrc_seq == 0);
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
+{
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz += itx->itx_sod;
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
- itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
+
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
+
+ if (o1 < o2)
+ return (-1);
+ if (o1 > o2)
+ return (1);
- return (seq);
+ return (0);
}
/*
- * Free up all in-memory intent log transactions that have now been synced.
+ * Remove all async itx with the given oid.
*/
static void
-zil_itx_clean(zilog_t *zilog)
+zil_remove_async(zilog_t *zilog, uint64_t oid)
{
- uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
- uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
list_t clean_list;
itx_t *itx;
+ ASSERT(oid != 0);
list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
- mutex_enter(&zilog->zl_lock);
- /* wait for a log writer to finish walking list */
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- }
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
- /*
- * Move the sync'd log transactions to a separate list so we can call
- * kmem_free without holding the zl_lock.
- *
- * There is no need to set zl_writer as we don't drop zl_lock here
- */
- while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
- itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
- list_insert_tail(&clean_list, itx);
- }
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
- /* destroy sync'd log transactions */
+ /*
+ * Locate the object node and append its list.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
while ((itx = list_head(&clean_list)) != NULL) {
list_remove(&clean_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
}
list_destroy(&clean_list);
}
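
All three walkers (zil_remove_async() above, zil_get_commit_list() and zil_async_to_sync() below) share the same scan: start from the oldest txg that can still hold itxs and probe each zl_itxg[] ring slot, tolerating slots that hold a stale txg. The common skeleton:

    uint64_t otxg, txg;

    if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)    /* ziltest */
        otxg = ZILTEST_TXG;
    else
        otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

    for (txg = otxg; txg < otxg + TXG_CONCURRENT_STATES; txg++) {
        itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

        mutex_enter(&itxg->itxg_lock);
        if (itxg->itxg_txg != txg) {    /* stale or empty slot */
            mutex_exit(&itxg->itxg_lock);
            continue;
        }
        /* ... per-caller work on itxg->itxg_itxs ... */
        mutex_exit(&itxg->itxg_lock);
    }
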
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if a fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+ zil_remove_async(zilog, itx->itx_oid);
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean = itxg->itxg_itxs;
+ }
+ ASSERT(itxg->itxg_sod == 0);
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
+ itxg->itxg_sod += itx->itx_sod;
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
+
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
-zil_clean(zilog_t *zilog)
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
- itx_t *itx;
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
- mutex_enter(&zilog->zl_lock);
- itx = list_head(&zilog->zl_itx_list);
- if ((itx != NULL) &&
- (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
- (void) taskq_dispatch(zilog->zl_clean_taskq,
- (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT(itxg->itxg_txg != 0);
+ ASSERT(zilog->zl_clean_taskq != NULL);
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+ * Preferably start a task queue to free up the old itxs but
+ * if taskq_dispatch can't allocate resources to do that then
+ * free it in-line. This should be rare. Note, using TQ_SLEEP
+ * created a bad performance problem.
+ */
+ if (taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+ zil_itxg_clean(clean_me);
+}
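
The tail of zil_clean() is a dispatch-or-fall-back pattern: try a non-blocking taskq dispatch and, if the queue can't take the work, do it in-line. A self-contained sketch of the pattern; try_dispatch() is an assumed stand-in for taskq_dispatch() with TQ_NOSLEEP, returning 0 on failure just as above:

#include <stdio.h>

/* Stand-in for taskq_dispatch(..., TQ_NOSLEEP): returns 0 when the
 * task could not be queued, the failure case handled above. */
static int
try_dispatch(void (*func)(void *), void *arg)
{
    (void)func;
    (void)arg;
    return (0); /* pretend the taskq is out of resources */
}

static void
clean_cb(void *arg)
{
    printf("cleaning %p\n", arg);
}

static void
clean(void *itxs)
{
    /* Prefer async cleanup, but never block; fall back in-line. */
    if (try_dispatch(clean_cb, itxs) == 0)
        clean_cb(itxs);
}

int
main(void)
{
    int dummy;

    clean(&dummy);
    return (0);
}
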
+
+/*
+ * Get the list of itxs to commit into zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+ uint64_t push_sod = 0;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+ push_sod += itxg->itxg_sod;
+ itxg->itxg_sod = 0;
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+ atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
+}
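
zil_get_commit_list() only has to visit txgs that can still hold itxs: the window of TXG_CONCURRENT_STATES txgs starting just past the last synced txg. Slots whose stamp doesn't match the txg being visited are stale and skipped. A toy walk of that window (constants and values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define TXG_SIZE 4
#define TXG_MASK (TXG_SIZE - 1)
#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */

int
main(void)
{
    uint64_t slot_txg[TXG_SIZE] = { 12, 13, 10, 11 }; /* ring stamps */
    uint64_t last_synced = 11;
    uint64_t otxg = last_synced + 1;

    for (uint64_t txg = otxg; txg < otxg + TXG_CONCURRENT_STATES; txg++) {
        if (slot_txg[txg & TXG_MASK] != txg) {
            printf("txg %llu: stale slot, skip\n",
                (unsigned long long)txg);
            continue;
        }
        printf("txg %llu: collect sync list\n",
            (unsigned long long)txg);
    }
    return (0);
}
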
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+static void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We append to the tail rather than the
+ * head so that any create record already in the sync list
+ * is committed first.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
}
- mutex_exit(&zilog->zl_lock);
}
static void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit_writer(zilog_t *zilog)
{
uint64_t txg;
- uint64_t commit_seq = 0;
- itx_t *itx, *itx_next = (itx_t *)-1;
+ itx_t *itx;
lwb_t *lwb;
- spa_t *spa;
+ spa_t *spa = zilog->zl_spa;
+ int error = 0;
- zilog->zl_writer = B_TRUE;
ASSERT(zilog->zl_root_zio == NULL);
- spa = zilog->zl_spa;
+
+ mutex_exit(&zilog->zl_lock);
+
+ zil_get_commit_list(zilog);
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL) {
+ mutex_enter(&zilog->zl_lock);
+ return;
+ }
if (zilog->zl_suspend) {
lwb = NULL;
} else {
lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- /*
- * Return if there's nothing to flush before we
- * dirty the fs by calling zil_create()
- */
- if (list_is_empty(&zilog->zl_itx_list)) {
- zilog->zl_writer = B_FALSE;
- return;
- }
- mutex_exit(&zilog->zl_lock);
- zil_create(zilog);
- mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
- }
+ if (lwb == NULL)
+ lwb = zil_create(zilog);
}
- /* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- for (;;) {
- /*
- * Find the next itx to push:
- * Push all transactions related to specified foid and all
- * other transactions except TX_WRITE, TX_TRUNCATE,
- * TX_SETATTR and TX_ACL for all other files.
- */
- if (itx_next != (itx_t *)-1)
- itx = itx_next;
- else
- itx = list_head(&zilog->zl_itx_list);
- for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
- if (foid == 0) /* push all foids? */
- break;
- if (itx->itx_sync) /* push all O_[D]SYNC */
- break;
- switch (itx->itx_lr.lrc_txtype) {
- case TX_SETATTR:
- case TX_WRITE:
- case TX_TRUNCATE:
- case TX_ACL:
- /* lr_foid is same offset for these records */
- if (((lr_write_t *)&itx->itx_lr)->lr_foid
- != foid) {
- continue; /* skip this record */
- }
- }
- break;
- }
- if (itx == NULL)
- break;
-
- if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
- break;
- }
-
- /*
- * Save the next pointer. Even though we soon drop
- * zl_lock all threads that may change the list
- * (another writer or zil_itx_clean) can't do so until
- * they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
- mutex_exit(&zilog->zl_lock);
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
- if (txg > spa_last_synced_txg(spa) ||
- txg > spa_freeze_txg(spa))
+ if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
+ list_remove(&zilog->zl_itx_commit_list, itx);
kmem_free(itx, offsetof(itx_t, itx_lr)
+ itx->itx_lr.lrc_reclen);
- mutex_enter(&zilog->zl_lock);
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
- /* determine commit sequence number */
- itx = list_head(&zilog->zl_itx_list);
- if (itx)
- commit_seq = itx->itx_lr.lrc_seq;
- else
- commit_seq = zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
/* write the last block out */
if (lwb != NULL && lwb->lwb_zio != NULL)
lwb = zil_lwb_write_start(zilog, lwb);
- zilog->zl_prev_used = zilog->zl_cur_used;
zilog->zl_cur_used = 0;
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
if (zilog->zl_root_zio) {
- DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
- (void) zio_wait(zilog->zl_root_zio);
+ error = zio_wait(zilog->zl_root_zio);
zilog->zl_root_zio = NULL;
- DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
zil_flush_vdevs(zilog);
}
- if (zilog->zl_log_error || lwb == NULL) {
- zilog->zl_log_error = 0;
+ if (error || lwb == NULL)
txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
mutex_enter(&zilog->zl_lock);
- zilog->zl_writer = B_FALSE;
- ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
- zilog->zl_commit_seq = commit_seq;
+ /*
+ * Remember the highest committed log sequence number for ztest.
+ * We only update this value when all the log writes succeeded,
+ * because ztest wants to ASSERT that it got the whole log chain.
+ */
+ if (error == 0 && lwb != NULL)
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
/*
- * Push zfs transactions to stable storage up to the supplied sequence number.
+ * Commit zfs transactions to stable storage.
* If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
+ * for that object or those that might reference that object.
+ *
+ * itxs are committed in batches. In a heavily stressed zil there will be
+ * a commit writer thread that is writing out a bunch of itxs to the log
+ * for a set of committing threads (cthreads) in the same batch as the writer.
+ * Those cthreads are all waiting on the same cv for that batch.
+ *
+ * There will also be a different and growing batch of threads that are
+ * waiting to commit (qthreads). When the committing batch completes
+ * a transition occurs such that the cthreads exit and the qthreads become
+ * cthreads. One of the new cthreads becomes the writer thread for the
+ * batch. Any new threads arriving become new qthreads.
+ *
+ * Only two condition variables are needed, and no thread ever moves
+ * from one cv to the other; the cvs simply flip-flop between serving
+ * qthreads and cthreads.
+ *
+ * Using this scheme we can efficiently wake up only those threads
+ * whose itxs have been committed.
*/
void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit(zilog_t *zilog, uint64_t foid)
{
- if (zilog == NULL || seq == 0)
- return;
+ uint64_t mybatch;
- mutex_enter(&zilog->zl_lock);
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
- seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+ /* move the async itxs for the foid to the sync queues */
+ zil_async_to_sync(zilog, foid);
+ mutex_enter(&zilog->zl_lock);
+ mybatch = zilog->zl_next_batch;
while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq < zilog->zl_commit_seq) {
+ cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
+ if (mybatch <= zilog->zl_com_batch) {
mutex_exit(&zilog->zl_lock);
return;
}
}
- zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
- /* wake up others waiting on the commit */
- cv_broadcast(&zilog->zl_cv_writer);
+
+ zilog->zl_next_batch++;
+ zilog->zl_writer = B_TRUE;
+ zil_commit_writer(zilog);
+ zilog->zl_com_batch = mybatch;
+ zilog->zl_writer = B_FALSE;
mutex_exit(&zilog->zl_lock);
+
+ /* wake up one thread to become the next writer */
+ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
+
+ /* wake up all threads waiting for this batch to be committed */
+ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
}
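
The batching protocol described in the block comment above can be modeled in a few dozen lines of pthreads code. This is a sketch of the scheme only, not the ZFS implementation; commit_batch() stands in for zil_commit_writer(), and the globals mirror zl_next_batch, zl_com_batch and zl_writer:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv[2] = {
    PTHREAD_COND_INITIALIZER,
    PTHREAD_COND_INITIALIZER
};
static uint64_t next_batch = 1; /* cf. zl_next_batch */
static uint64_t com_batch = 0;  /* cf. zl_com_batch */
static int writer = 0;          /* cf. zl_writer */

static void
commit_batch(void)
{
    /* stand-in for zil_commit_writer(): write out the batch */
}

void
commit(void)
{
    uint64_t mybatch;

    pthread_mutex_lock(&lock);
    mybatch = next_batch;
    while (writer) {
        /* batch N waits on cv[N & 1] */
        pthread_cond_wait(&cv[mybatch & 1], &lock);
        if (mybatch <= com_batch) {
            /* the writer committed our batch for us */
            pthread_mutex_unlock(&lock);
            return;
        }
    }
    next_batch++;
    writer = 1;
    pthread_mutex_unlock(&lock);

    commit_batch();

    pthread_mutex_lock(&lock);
    com_batch = mybatch;
    writer = 0;
    pthread_mutex_unlock(&lock);

    /* elect the next writer from the waiting (q)batch ... */
    pthread_cond_signal(&cv[(mybatch + 1) & 1]);
    /* ... and release everyone committed in this batch */
    pthread_cond_broadcast(&cv[mybatch & 1]);
}

int
main(void)
{
    commit();
    return (0);
}
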
/*
@@ -1217,6 +1536,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
zil_header_t *zh = zil_header_in_syncing_context(zilog);
uint64_t txg = dmu_tx_get_txg(tx);
spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
lwb_t *lwb;
/*
@@ -1230,7 +1550,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
ASSERT(zilog->zl_stop_sync == 0);
- zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
@@ -1259,7 +1583,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
- zio_free_blk(spa, &lwb->lwb_blk, txg);
+ zio_free_zil(spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
/*
@@ -1287,6 +1611,18 @@ zil_fini(void)
kmem_cache_destroy(zil_lwb_cache);
}
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+ zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+ zilog->zl_logbias = logbias;
+}
+
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
@@ -1299,15 +1635,23 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_spa = dmu_objset_spa(os);
zilog->zl_dmu_pool = dmu_objset_pool(os);
zilog->zl_destroy_txg = TXG_INITIAL - 1;
+ zilog->zl_logbias = dmu_objset_logbias(os);
+ zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_next_batch = 1;
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zilog->zl_itx_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
offsetof(lwb_t, lwb_node));
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
@@ -1315,6 +1659,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
return (zilog);
}
@@ -1322,27 +1668,47 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
void
zil_free(zilog_t *zilog)
{
- lwb_t *lwb;
+ lwb_t *head_lwb;
zilog->zl_stop_sync = 1;
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- kmem_cache_free(zil_lwb_cache, lwb);
+ /*
+ * After zil_close() there should only be one lwb with a buffer.
+ */
+ head_lwb = list_head(&zilog->zl_lwb_list);
+ if (head_lwb) {
+ ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
+ list_remove(&zilog->zl_lwb_list, head_lwb);
+ zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
+ kmem_cache_free(zil_lwb_cache, head_lwb);
}
list_destroy(&zilog->zl_lwb_list);
avl_destroy(&zilog->zl_vdev_tree);
mutex_destroy(&zilog->zl_vdev_lock);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
- list_destroy(&zilog->zl_itx_list);
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE), so there's no zil_clean()
+ * callback to remove the entry. We remove any such entries here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
mutex_destroy(&zilog->zl_lock);
cv_destroy(&zilog->zl_cv_writer);
cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_cv_batch[0]);
+ cv_destroy(&zilog->zl_cv_batch[1]);
kmem_free(zilog, sizeof (zilog_t));
}
@@ -1368,26 +1734,28 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
void
zil_close(zilog_t *zilog)
{
+ lwb_t *tail_lwb;
+ uint64_t txg = 0;
+
+ zil_commit(zilog, 0); /* commit all itx */
+
/*
- * If the log isn't already committed, mark the objset dirty
- * (so zil_sync() will be called) and wait for that txg to sync.
+ * The lwb_max_txg for the stubby lwb will reflect the last activity
+ * for the zil. After a txg_wait_synced() on the txg we know all the
+ * callbacks have occurred that may clean the zil. Only then can we
+ * destroy the zl_clean_taskq.
*/
- if (!zil_is_committed(zilog)) {
- uint64_t txg;
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
+ mutex_enter(&zilog->zl_lock);
+ tail_lwb = list_tail(&zilog->zl_lwb_list);
+ if (tail_lwb != NULL)
+ txg = tail_lwb->lwb_max_txg;
+ mutex_exit(&zilog->zl_lock);
+ if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
zilog->zl_get_data = NULL;
-
- zil_itx_clean(zilog);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
/*
@@ -1419,15 +1787,7 @@ zil_suspend(zilog_t *zilog)
zilog->zl_suspending = B_TRUE;
mutex_exit(&zilog->zl_lock);
- zil_commit(zilog, UINT64_MAX, 0);
-
- /*
- * Wait for any in-flight log writes to complete.
- */
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
+ zil_commit(zilog, 0);
zil_destroy(zilog, B_FALSE);
@@ -1448,102 +1808,89 @@ zil_resume(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
}
-/*
- * Read in the data for the dmu_sync()ed block, and change the log
- * record to write this whole block.
- */
-void
-zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
-{
- blkptr_t *wbp = &lr->lr_blkptr;
- char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t blksz;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- /*
- * If the blksz is zero then we must be replaying a log
- * from a version prior to setting the blksize of null blocks.
- * So we just zero the actual write size requested.
- */
- if (blksz == 0) {
- bzero(wbuf, lr->lr_length);
- return;
- }
- bzero(wbuf, blksz);
- } else {
- /*
- * A subsequent write may have overwritten this block, in which
- * case wbp may have been freed and reallocated, and our
- * read of wbp may fail with a checksum error. We can safely
- * ignore this because the later write will provide the
- * correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lr->lr_foid;
- zb.zb_level = 0;
- zb.zb_blkid = -1; /* unknown */
-
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
- NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- }
- lr->lr_offset -= lr->lr_offset % blksz;
- lr->lr_length = blksz;
-}
-
typedef struct zil_replay_arg {
- objset_t *zr_os;
zil_replay_func_t **zr_replay;
void *zr_arg;
boolean_t zr_byteswap;
- char *zr_lrbuf;
+ char *zr_lr;
} zil_replay_arg_t;
-static void
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+ char name[MAXNAMELEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
zil_replay_arg_t *zr = zra;
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
- char *name;
- int pass, error;
+ int error = 0;
- if (!zilog->zl_replay) /* giving up */
- return;
-
- if (lr->lrc_txg < claim_txg) /* already committed */
- return;
+ zilog->zl_replaying_seq = lr->lrc_seq;
if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return;
+ return (0);
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return (0);
/* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI;
- if (txtype == 0 || txtype >= TX_MAX_TYPE) {
- error = EINVAL;
- goto bad;
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ ((lr_ooo_t *)lr)->lr_foid, NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
}
/*
* Make a copy of the data so we can revise and extend it.
*/
- bcopy(lr, zr->zr_lrbuf, reclen);
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
+ }
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different data types, and only the
+ * However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lrbuf, reclen);
+ byteswap_uint64_array(zr->zr_lr, reclen);
/*
* We must now do two things atomically: replay this log record,
@@ -1551,42 +1898,30 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/
- for (pass = 1; pass <= 2; pass++) {
- zilog->zl_replaying_seq = lr->lrc_seq;
- /* Only byteswap (if needed) on the 1st pass. */
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
- zr->zr_byteswap && pass == 1);
-
- if (!error)
- return;
-
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error) {
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
- * any removes then retry the transaction.
+ * any removes then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
*/
- if (pass == 1)
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
}
-
-bad:
- ASSERT(error);
- name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu %s\n",
- error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
- (lr->lrc_txtype & TX_CI) ? "CI" : "");
- zilog->zl_replay = B_FALSE;
- kmem_free(name, MAXNAMELEN);
+ return (0);
}
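
The retry logic above is worth isolating: replay once, and on failure sync out pending removes and replay exactly once more, without byteswapping the already-native buffer a second time. A compact sketch with stubs standing in for the replay vector and txg_wait_synced():

#include <stdio.h>

typedef int (*replay_func_t)(void *arg, char *lr, int byteswap);

static int attempts;

static int
toy_replay(void *arg, char *lr, int byteswap)
{
    (void)arg;
    (void)lr;
    (void)byteswap;
    return (attempts++ == 0 ? 17 : 0); /* fail once, e.g. EEXIST */
}

static void
sync_out_removes(void)
{
    /* stands in for txg_wait_synced(): pending frees land here */
}

static int
replay_record(replay_func_t vec, void *arg, char *lr, int byteswap)
{
    int error = vec(arg, lr, byteswap);

    if (error) {
        sync_out_removes();
        /* byteswap 0 on retry: the buffer is already native-endian */
        error = vec(arg, lr, 0);
    }
    return (error);
}

int
main(void)
{
    char lr[8] = { 0 };

    printf("error = %d\n", replay_record(toy_replay, NULL, lr, 1));
    return (0);
}
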
/* ARGSUSED */
-static void
+static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zilog->zl_replay_blks++;
+
+ return (0);
}
/*
@@ -1605,11 +1940,10 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
}
//printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
- zr.zr_os = os;
zr.zr_replay = replay_func;
zr.zr_arg = arg;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+ zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
/*
* Wait for in-progress removes to sync before starting replay.
@@ -1617,11 +1951,11 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
txg_wait_synced(zilog->zl_dmu_pool, 0);
zilog->zl_replay = B_TRUE;
- zilog->zl_replay_time = LBOLT;
+ zilog->zl_replay_time = ddi_get_lbolt();
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
- kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+ kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
@@ -1629,58 +1963,31 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
//printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
}
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
- lwb_t *lwb;
- int ret;
-
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
- /* recent unpushed intent log transactions? */
- if (!list_is_empty(&zilog->zl_itx_list)) {
- ret = B_FALSE;
- goto out;
- }
-
- /* intent log never used? */
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- ret = B_TRUE;
- goto out;
- }
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return (B_TRUE);
- /*
- * more than 1 log buffer means zil_sync() hasn't yet freed
- * entries after a txg has committed
- */
- if (list_next(&zilog->zl_lwb_list, lwb)) {
- ret = B_FALSE;
- goto out;
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
}
- ASSERT(zil_empty(zilog));
- ret = B_TRUE;
-out:
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
- return (ret);
+ return (B_FALSE);
}
/* ARGSUSED */
int
-zil_vdev_offline(char *osname, void *arg)
+zil_vdev_offline(const char *osname, void *arg)
{
objset_t *os;
zilog_t *zilog;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error)
return (error);
@@ -1689,6 +1996,6 @@ zil_vdev_offline(char *osname, void *arg)
error = EEXIST;
else
zil_resume(zilog);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 32f15e90d4fa..5e968b5c29b9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -32,6 +31,9 @@
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -57,6 +59,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
6, /* ZIO_PRIORITY_ASYNC_READ */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
+ 2, /* ZIO_PRIORITY_DDT_PREFETCH */
};
/*
@@ -69,10 +72,6 @@ char *zio_type_name[ZIO_TYPES] = {
"zio_ioctl"
};
-#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
-#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
-#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
-
/*
* ==========================================================================
* I/O kmem caches
@@ -91,8 +90,15 @@ extern vmem_t *zio_alloc_arena;
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
-#define IO_IS_ALLOCATING(zio) \
- ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
void
zio_init(void)
@@ -113,6 +119,7 @@ zio_init(void)
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
size_t p2 = size;
size_t align = 0;
+ size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
while (p2 & (p2 - 1))
p2 &= p2 - 1;
@@ -129,11 +136,17 @@ zio_init(void)
char name[36];
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+ /*
+ * Since zio_data bufs do not appear in crash dumps, we
+ * pass KMC_NOTOUCH so that no allocator metadata is
+ * stored with the buffers.
+ */
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL,
+ cflags | KMC_NOTOUCH);
}
}
@@ -280,7 +293,8 @@ zio_pop_transforms(zio_t *zio)
zt->zt_transform(zio,
zt->zt_orig_data, zt->zt_orig_size);
- zio_buf_free(zio->io_data, zt->zt_bufsize);
+ if (zt->zt_bufsize != 0)
+ zio_buf_free(zio->io_data, zt->zt_bufsize);
zio->io_data = zt->zt_orig_data;
zio->io_size = zt->zt_orig_size;
@@ -309,7 +323,7 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
{
if (zio->io_error == 0 &&
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_data, zio->io_size, data, size) != 0)
+ zio->io_data, data, zio->io_size, size) != 0)
zio->io_error = EIO;
}
@@ -394,6 +408,9 @@ zio_add_child(zio_t *pio, zio_t *cio)
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
}
@@ -410,6 +427,9 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
@@ -425,7 +445,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
mutex_enter(&zio->io_lock);
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
- zio->io_stage--;
+ zio->io_stage >>= 1;
zio->io_stall = countp;
waiting = B_TRUE;
}
@@ -467,10 +487,11 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
* ==========================================================================
*/
static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
- const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
+ zio_type_t type, int priority, enum zio_flag flags,
+ vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+ enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
@@ -497,14 +518,17 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
else
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
- zio->io_bp = bp;
+ zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- if (type != ZIO_TYPE_WRITE)
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
@@ -514,14 +538,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_spa = spa;
zio->io_txg = txg;
- zio->io_data = data;
- zio->io_size = size;
zio->io_done = done;
zio->io_private = private;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
+ zio->io_orig_data = zio->io_data = data;
+ zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -555,7 +579,7 @@ zio_destroy(zio_t *zio)
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
- void *private, int flags)
+ void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -567,7 +591,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
}
zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
return (zio_null(NULL, spa, NULL, done, private, flags));
}
@@ -575,23 +599,24 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
- zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
data, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
return (zio);
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
@@ -601,13 +626,15 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
zp->zp_type < DMU_OT_NUMTYPES &&
zp->zp_level < 32 &&
- zp->zp_ndvas > 0 &&
- zp->zp_ndvas <= spa_max_replication(spa));
- ASSERT(ready != NULL);
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa) &&
+ zp->zp_dedup <= 1 &&
+ zp->zp_dedup_verify <= 1);
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
zio->io_ready = ready;
zio->io_prop = *zp;
@@ -618,7 +645,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ enum zio_flag flags, zbookmark_t *zb)
{
zio_t *zio;
@@ -629,33 +656,47 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
return (zio);
}
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+}
+
zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ enum zio_flag flags)
{
zio_t *zio;
- ASSERT(!BP_IS_HOLE(bp));
-
- if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
+ dprintf_bp(bp, "freeing in txg %llu, pass %u",
+ (longlong_t)txg, spa->spa_sync_pass);
- if (txg == spa->spa_syncing_txg &&
- spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
- }
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(spa_syncing_txg(spa) == txg);
+ ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
return (zio);
}
zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -669,9 +710,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT3U(spa_first_txg(spa), <=, txg);
+ ASSERT(txg == spa_first_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
@@ -682,7 +725,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
zio_t *zio;
int c;
@@ -707,7 +750,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -728,7 +771,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -743,9 +786,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio->io_prop.zp_checksum = checksum;
- if (zio_checksum_table[checksum].ci_zbt) {
+ if (zio_checksum_table[checksum].ci_eck) {
/*
- * zbt checksums are necessarily destructive -- they modify
+ * zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
@@ -763,10 +806,10 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, int flags,
+ void *data, uint64_t size, int type, int priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
- uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
ASSERT(vd->vdev_parent ==
@@ -779,26 +822,33 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
- pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
- pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
if (vd->vdev_children == 0)
offset += VDEV_LABEL_START_SIZE;
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
- done, private, type, priority,
- (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
- vd, offset, &pio->io_bookmark,
- ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- int type, int priority, int flags, zio_done_func_t *done, void *private)
+ int type, int priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -808,7 +858,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
data, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
vd, offset, NULL,
- ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
return (zio);
}
@@ -821,6 +871,23 @@ zio_flush(zio_t *zio, vdev_t *vd)
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT(zio->io_executor == NULL);
+ ASSERT(zio->io_orig_size == zio->io_size);
+ ASSERT(size <= zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp))
+ zio->io_orig_size = zio->io_size = size;
+}
+
/*
* ==========================================================================
* Prepare to read and write logical blocks
@@ -835,28 +902,33 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(psize);
- zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
+ zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
- int compress = zp->zp_compress;
+ enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
- void *cbuf;
uint64_t lsize = zio->io_size;
- uint64_t csize = lsize;
- uint64_t cbufsize = 0;
+ uint64_t psize = lsize;
int pass = 1;
/*
@@ -870,7 +942,29 @@ zio_write_bp_init(zio_t *zio)
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
- ASSERT(compress != ZIO_COMPRESS_INHERIT);
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+ zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ }
if (bp->blk_birth == zio->io_txg) {
/*
@@ -882,22 +976,29 @@ zio_write_bp_init(zio_t *zio)
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
- pass = spa_sync_pass(zio->io_spa);
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
if (pass > SYNC_PASS_DONT_COMPRESS)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
- spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
+ ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
if (compress != ZIO_COMPRESS_OFF) {
- if (!zio_compress_data(compress, zio->io_data, zio->io_size,
- &cbuf, &csize, &cbufsize)) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
- } else if (csize != 0) {
- zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
+ zio_buf_free(cbuf, lsize);
+ } else {
+ ASSERT(psize < lsize);
+ zio_push_transform(zio, cbuf, psize, lsize, NULL);
}
}
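
Under the new contract in this hunk, the compressor returns the physical size, and a result of 0 or lsize means the buffer didn't shrink, so the compressed copy is discarded and the write proceeds uncompressed. A toy version of that decision; try_compress() is an assumed stand-in for zio_compress_data():

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for zio_compress_data(): returns the compressed size,
 * or 0 / lsize when compression didn't help. */
static size_t
try_compress(const void *src, void *dst, size_t lsize)
{
    (void)src;
    (void)dst;
    return (lsize); /* pretend the data is incompressible */
}

int
main(void)
{
    size_t lsize = 4096, psize;
    char *src = calloc(1, lsize);
    char *cbuf = malloc(lsize);

    psize = try_compress(src, cbuf, lsize);
    if (psize == 0 || psize == lsize) {
        free(cbuf); /* no gain: write the logical buffer as-is */
        psize = lsize;
        printf("stored uncompressed, %zu bytes\n", psize);
    } else {
        printf("stored compressed, %zu of %zu bytes\n", psize, lsize);
    }
    free(src);
    return (0);
}
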
@@ -909,10 +1010,10 @@ zio_write_bp_init(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
pass > SYNC_PASS_REWRITE) {
- ASSERT(csize != 0);
- uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+ ASSERT(psize != 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -920,17 +1021,36 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline = ZIO_WRITE_PIPELINE;
}
- if (csize == 0) {
+ if (psize == 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, csize);
+ BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
}
return (ZIO_PIPELINE_CONTINUE);
@@ -943,10 +1063,11 @@ zio_write_bp_init(zio_t *zio)
*/
static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
+ int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
#ifdef _KERNEL
struct ostask *task;
#endif
@@ -984,10 +1105,10 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, task);
+ (task_func_t *)zio_execute, zio, flags, task);
#else
(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, TQ_SLEEP);
+ (task_func_t *)zio_execute, zio, flags);
#endif
}
@@ -1007,7 +1128,7 @@ zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
static int
zio_issue_async(zio_t *zio)
{
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
@@ -1015,7 +1136,7 @@ zio_issue_async(zio_t *zio)
void
zio_interrupt(zio_t *zio)
{
- zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
/*
@@ -1031,7 +1152,7 @@ zio_interrupt(zio_t *zio)
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
+static zio_pipe_stage_t *zio_pipeline[];
void
zio_execute(zio_t *zio)
@@ -1039,32 +1160,39 @@ zio_execute(zio_t *zio)
zio->io_executor = curthread;
while (zio->io_stage < ZIO_STAGE_DONE) {
- uint32_t pipeline = zio->io_pipeline;
- zio_stage_t stage = zio->io_stage;
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
- while (((1U << ++stage) & pipeline) == 0)
- continue;
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
ASSERT(stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stall == NULL);
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
- * issue async to avoid deadlock.
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
+ *
+ * For VDEV_IO_START, we cut in line so that the io will
+ * be sent to disk promptly.
*/
- if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
- zio->io_vd == NULL &&
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
return;
}
zio->io_stage = stage;
- rv = zio_pipeline[stage](zio);
+ rv = zio_pipeline[highbit(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
return;
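
Since stages are now one-hot bits rather than ordinals (hence io_stage >>= 1 earlier and stage <<= 1 here), advancing a zio means shifting to the next bit present in the pipeline mask and dispatching by bit position. A toy walk of a pipeline mask under those assumptions:

#include <stdio.h>

/* One-hot stage bits; a pipeline is the OR of the stages it runs. */
enum stage {
    STAGE_OPEN  = 1 << 0,
    STAGE_ISSUE = 1 << 1,
    STAGE_DONE  = 1 << 2
};

/* 1-based index of the highest set bit, like highbit() */
static int
highbit(unsigned int x)
{
    int h = 0;

    while (x != 0) {
        h++;
        x >>= 1;
    }
    return (h);
}

int
main(void)
{
    unsigned int pipeline = STAGE_OPEN | STAGE_DONE; /* skips ISSUE */
    unsigned int stage = STAGE_OPEN;

    while (stage < STAGE_DONE) {
        do {
            stage <<= 1;                   /* next stage bit... */
        } while ((stage & pipeline) == 0); /* ...in this pipeline */
        printf("dispatch zio_pipeline[%d]\n", highbit(stage) - 1);
    }
    return (0);
}
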
@@ -1147,19 +1275,8 @@ zio_reexecute(zio_t *pio)
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
- if (IO_IS_ALLOCATING(pio)) {
- /*
- * Remember the failed bp so that the io_ready() callback
- * can update its accounting upon reexecution. The block
- * was already freed in zio_done(); we indicate this with
- * a fill count of -1 so that zio_free() knows to skip it.
- */
- blkptr_t *bp = pio->io_bp;
- ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
- bp->blk_fill = BLK_FILL_ALREADY_FREED;
- pio->io_bp_orig = *bp;
- BP_ZERO(bp);
- }
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
/*
* As we reexecute pio's children, new children could be created.
@@ -1347,6 +1464,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp));
}
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
@@ -1360,8 +1483,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
- return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
- NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+ return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
@@ -1445,7 +1568,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
@@ -1455,7 +1578,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
ASSERT(zio->io_data == gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1482,7 +1605,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1551,9 +1674,9 @@ zio_write_gang_member_ready(zio_t *zio)
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
- ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
@@ -1578,13 +1701,13 @@ zio_write_gang_block(zio_t *pio)
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
- int ndvas = gio->io_prop.zp_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
int error;
- error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+ error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
if (error) {
pio->io_error = error;
@@ -1620,7 +1743,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
- zp.zp_ndvas = gio->io_prop.zp_ndvas;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = 0;
+ zp.zp_dedup_verify = 0;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1641,15 +1766,383 @@ zio_write_gang_block(zio_t *pio)
/*
* ==========================================================================
- * Allocate and free blocks
+ * Dedup
* ==========================================================================
*/
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+ if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+ dde->dde_repair_data = zio->io_data;
+ else
+ zio_buf_free(zio->io_data, zio->io_size);
+ mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio_ddt_child_read_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+ &zio->io_bookmark));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
static int
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ return (ZIO_PIPELINE_STOP);
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ if (dde == NULL) {
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (ZIO_PIPELINE_STOP);
+ }
+ if (dde->dde_repair_data != NULL) {
+ bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ */
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ bcmp(zio->io_orig_data, lio->io_orig_data,
+ zio->io_orig_size) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ uint32_t aflags = ARC_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ ddt_exit(ddt);
+
+ error = arc_read_nolock(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (arc_buf_size(abuf) != zio->io_orig_size ||
+ bcmp(abuf->b_data, zio->io_orig_data,
+ zio->io_orig_size) != 0)
+ error = EEXIST;
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
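
Stripped of the ARC and DDT plumbing, zio_ddt_collision() answers one question: do two blocks with equal checksums actually contain equal bytes? The core test reduces to a size check plus memcmp(), sketched here with a toy entry type (not the real ddt_entry_t):

#include <stdio.h>
#include <string.h>

/* Toy table entry: checksum plus the bytes it is believed to cover. */
struct toy_entry {
    unsigned int cksum;
    const void *data;
    size_t size;
};

/* Nonzero means a true collision: same checksum, different bytes,
 * so the write must not be deduplicated against this entry. */
static int
toy_collision(const struct toy_entry *e, const void *data, size_t size)
{
    return (e->size != size || memcmp(e->data, data, size) != 0);
}

int
main(void)
{
    struct toy_entry e = { 0xabcd, "hello", 5 };

    printf("same bytes: collision=%d\n", toy_collision(&e, "hello", 5));
    printf("diff bytes: collision=%d\n", toy_collision(&e, "hellx", 5));
    return (0);
}
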
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ while ((pio = zio_walk_parents(zio)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ while (zio_walk_parents(zio) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+ int p = DDT_PHYS_DITTO;
+ zio_prop_t *zp = &zio->io_prop;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_key_t *ddk = &dde->dde_key;
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ if (ddp->ddp_phys_birth != 0)
+ ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ ddt_phys_fill(ddp, bp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ int ditto_copies;
+ zio_t *cio = NULL;
+ zio_t *dio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = 0;
+ }
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+ if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ zio_prop_t czp = *zp;
+
+ czp.zp_copies = ditto_copies;
+
+ /*
+ * If we arrived here with an override bp, we won't have run
+ * the transform stack, so we won't have the data we need to
+ * generate a child i/o. So, toss the override bp and restart.
+ * This is safe, because using the override bp is just an
+ * optimization; and it's rare, so the cost doesn't matter.
+ */
+ if (zio->io_bp_override) {
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, &czp, NULL,
+ zio_ddt_ditto_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ if (cio)
+ zio_nowait(cio);
+ if (dio)
+ zio_nowait(dio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
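Stripped of the ditto and override handling, the dedup write path is: look up the entry for this block's checksum; if a physical copy already exists, add a reference and fill the bp from it; otherwise issue one child write that later writers can attach to. A toy single-entry sketch of that refcounting (hypothetical types; the real table is the on-disk DDT):

	#include <stdint.h>
	#include <string.h>

	typedef struct toy_dde {
		uint64_t	key[4];		/* 256-bit block checksum */
		uint64_t	refcnt;		/* logical references */
		uint64_t	phys_birth;	/* 0 = no copy on disk yet */
	} toy_dde_t;

	/* Returns nonzero if the caller must actually write the data. */
	static int
	toy_ddt_write(toy_dde_t *dde, const uint64_t key[4])
	{
		if (dde->phys_birth != 0 &&
		    memcmp(dde->key, key, sizeof (dde->key)) == 0) {
			dde->refcnt++;	/* existing copy: just add a ref */
			return (0);
		}
		memcpy(dde->key, key, sizeof (dde->key));
		dde->refcnt = 1;
		return (1);	/* write-done then sets phys_birth, as above */
	}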
+
+ddt_entry_t *freedde; /* for debugging */
+
+static int
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ ddt_exit(ddt);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa->spa_normal_class;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
@@ -1660,12 +2153,12 @@ zio_dva_allocate(zio_t *zio)
ASSERT(BP_IS_HOLE(bp));
ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
if (error) {
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -1704,36 +2197,11 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- spa_t *spa = zio->io_spa;
- boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
- if (zio->io_bp == bp && !now) {
- /*
- * This is a rewrite for sync-to-convergence.
- * We can't do a metaslab_free(NOW) because bp wasn't allocated
- * during this sync pass, which means that metaslab_sync()
- * already committed the allocation.
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
- BP_IDENTITY(&zio->io_bp_orig)));
- ASSERT(spa_sync_pass(spa) > 1);
-
- if (BP_IS_GANG(bp) && gn == NULL) {
- /*
- * This is a gang leader whose gang header(s) we
- * couldn't read now, so defer the free until later.
- * The block should still be intact because without
- * the headers, we'd never even start the rewrite.
- */
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return;
- }
- }
+ ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
- metaslab_free(spa, bp, bp->blk_birth, now);
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -1747,25 +2215,31 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t txg)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t size, boolean_t use_slog)
{
- int error;
+ int error = 1;
- error = metaslab_alloc(spa, spa->spa_log_class, size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ ASSERT(txg > spa_syncing_txg(spa));
+
+ if (use_slog)
+ error = metaslab_alloc(spa, spa_log_class(spa), size,
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error)
- error = metaslab_alloc(spa, spa->spa_normal_class, size,
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
}
@@ -1773,15 +2247,15 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
}
/*
- * Free an intent log block. We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
+ * Free an intent log block.
*/
void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
+ ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
ASSERT(!BP_IS_GANG(bp));
- metaslab_free(spa, bp, txg, B_FALSE);
+ zio_free(spa, txg, bp);
}
/*
@@ -1809,6 +2283,26 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ /*
+ * We keep track of time-sensitive I/Os so that the scan thread
+ * can quickly react to certain workloads. In particular, we care
+ * about non-scrubbing, top-level reads and writes with the following
+ * characteristics:
+ * - synchronous writes of user data to non-slog devices
+ * - any reads of user data
+ * When these conditions are met, adjust the timestamp of spa_last_io
+ * which allows the scan thread to adjust its workload accordingly.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+ vd == vd->vdev_top && !vd->vdev_islog &&
+ zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+ zio->io_txg != spa_syncing_txg(spa)) {
+ uint64_t old = spa->spa_last_io;
+ uint64_t new = ddi_get_lbolt64();
+ if (old != new)
+ (void) atomic_cas_64(&spa->spa_last_io, old, new);
+ }
+
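The spa_last_io update is the usual lock-free latest-timestamp idiom: read, compare, publish with one compare-and-swap, and shrug off lost races because the winner stored an equally fresh value. A stand-alone sketch with C11 atomics (the kernel uses atomic_cas_64 instead):

	#include <stdatomic.h>
	#include <stdint.h>

	static _Atomic uint64_t last_io;

	static void
	toy_touch_last_io(uint64_t now)
	{
		uint64_t old = atomic_load_explicit(&last_io,
		    memory_order_relaxed);

		if (old != now) {
			/* If another thread wins, its stamp is just as new. */
			(void) atomic_compare_exchange_strong(&last_io,
			    &old, now);
		}
	}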
align = 1ULL << vd->vdev_top->vdev_ashift;
if (P2PHASE(zio->io_size, align) != 0) {
@@ -1824,7 +2318,7 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -1910,6 +2404,32 @@ zio_vdev_io_done(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+ const void *good_buf)
+{
+ /* no processing needed */
+ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+ void *buf = zio_buf_alloc(zio->io_size);
+
+ bcopy(zio->io_data, buf, zio->io_size);
+
+ zcr->zcr_cbinfo = zio->io_size;
+ zcr->zcr_cbdata = buf;
+ zcr->zcr_finish = zio_vsd_default_cksum_finish;
+ zcr->zcr_free = zio_buf_free;
+}
+
static int
zio_vdev_io_assess(zio_t *zio)
{
@@ -1922,7 +2442,7 @@ zio_vdev_io_assess(zio_t *zio)
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
if (zio->io_vsd != NULL) {
- zio->io_vsd_free(zio);
+ zio->io_vsd_ops->vsd_free(zio);
zio->io_vsd = NULL;
}
@@ -1931,6 +2451,9 @@ zio_vdev_io_assess(zio_t *zio)
/*
* If the I/O failed, determine whether we should attempt to retry it.
+ *
+ * On retry, we cut in line in the issue queue, since we don't want
+ * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
*/
if (zio->io_error && vd == NULL &&
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
@@ -1939,8 +2462,9 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_error = 0;
zio->io_flags |= ZIO_FLAG_IO_RETRY |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+ zio_requeue_io_start_cut_in_line);
return (ZIO_PIPELINE_STOP);
}
@@ -1972,7 +2496,7 @@ zio_vdev_io_reissue(zio_t *zio)
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
ASSERT(zio->io_error == 0);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1980,7 +2504,7 @@ zio_vdev_io_redone(zio_t *zio)
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1990,7 +2514,7 @@ zio_vdev_io_bypass(zio_t *zio)
ASSERT(zio->io_error == 0);
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
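The "- 1" to ">> 1" changes in these hunks follow from pipeline stages becoming one-hot bit flags rather than consecutive integers: the previous stage is a right shift, and a stage can be masked out of io_pipeline directly (as zio_checksum_verified() does below). A minimal sketch of the encoding, with made-up stage names:

	#include <stdint.h>

	enum toy_stage {			/* one-hot; illustrative subset */
		TOY_STAGE_OPEN		= 1 << 0,
		TOY_STAGE_CHECKSUM	= 1 << 1,
		TOY_STAGE_VDEV_IO	= 1 << 2,
		TOY_STAGE_DONE		= 1 << 3
	};

	static inline uint32_t
	toy_prev_stage(uint32_t stage)
	{
		return (stage >> 1);		/* step back one stage */
	}

	static inline void
	toy_skip_stage(uint32_t *pipeline, uint32_t stage)
	{
		*pipeline &= ~stage;	/* drop one stage from the pipeline */
	}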
/*
@@ -2032,9 +2556,12 @@ zio_checksum_generate(zio_t *zio)
static int
zio_checksum_verify(zio_t *zio)
{
+ zio_bad_cksum_t info;
blkptr_t *bp = zio->io_bp;
int error;
+ ASSERT(zio->io_vd != NULL);
+
if (bp == NULL) {
/*
* This is zio_read_phys().
@@ -2046,11 +2573,12 @@ zio_checksum_verify(zio_t *zio)
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
}
- if ((error = zio_checksum_error(zio)) != 0) {
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
zio->io_error = error;
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, zio->io_vd, zio, 0, 0);
+ zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, zio, zio->io_offset,
+ zio->io_size, NULL, &info);
}
}
@@ -2063,7 +2591,7 @@ zio_checksum_verify(zio_t *zio)
void
zio_checksum_verified(zio_t *zio)
{
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/*
@@ -2103,7 +2631,8 @@ zio_ready(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
return (ZIO_PIPELINE_STOP);
if (zio->io_ready) {
@@ -2137,6 +2666,19 @@ zio_ready(zio_t *zio)
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
+ if (zio_injection_enabled &&
+ zio->io_spa->spa_syncing_txg == zio->io_txg)
+ zio_handle_ignored_writes(zio);
+
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2156,6 +2698,7 @@ zio_done(zio_t *zio)
*/
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
@@ -2166,23 +2709,51 @@ zio_done(zio_t *zio)
if (bp != NULL) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bp->blk_pad[2] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
(bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ zio->io_bp_override == NULL &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
}
/*
- * If there were child vdev or gang errors, they apply to us now.
+ * If there were child vdev/gang/ddt errors, they apply to us now.
*/
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ char *abuf = zio->io_data;
+
+ if (asize != psize) {
+ abuf = zio_buf_alloc(asize);
+ bcopy(zio->io_data, abuf, psize);
+ bzero(abuf + psize, asize - psize);
+ }
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, abuf);
+ zfs_ereport_free_checksum(zcr);
+
+ if (asize != psize)
+ zio_buf_free(abuf, asize);
+ }
+ }
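P2ROUNDUP pads the payload up to the checksum report's alignment, zero-filling the tail so zcr_finish sees a fully aligned buffer. For a power-of-two alignment it is equivalent to the familiar mask form (shown as an assumption, for clarity):

	/* Equivalent formulation for power-of-two 'a'. */
	#define	TOY_P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))
	/* e.g. TOY_P2ROUNDUP(3584, 4096) == 4096,
	   TOY_P2ROUNDUP(4096, 4096) == 4096 */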
zio_pop_transforms(zio); /* note: may set zio->io_error */
@@ -2198,8 +2769,9 @@ zio_done(zio_t *zio)
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
- if ((zio->io_error == EIO ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == lio) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
@@ -2216,22 +2788,34 @@ zio_done(zio_t *zio)
* propagate all the way to the root via zio_notify_parent().
*/
ASSERT(vd == NULL && bp != NULL);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- if (IO_IS_ALLOCATING(zio))
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
zio->io_error == ENXIO &&
- spa->spa_load_state == SPA_LOAD_NONE &&
+ spa_load_state(spa) == SPA_LOAD_NONE &&
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ /*
+ * Here is a possibly good place to attempt to do
+ * either combinatorial reconstruction or error correction
+ * based on checksums. It also might be a good place
+ * to send out preliminary ereports before we suspend
+ * processing.
+ */
}
/*
@@ -2242,11 +2826,10 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
- zio->io_child_type == ZIO_CHILD_LOGICAL) {
- ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
- }
zio_gang_tree_free(&zio->io_gang_tree);
@@ -2320,22 +2903,33 @@ zio_done(zio_t *zio)
#ifdef _KERNEL
(void) taskq_dispatch_safe(
spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio,
- &zio->io_task_issue);
+ (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
+ &zio->io_task_issue);
#else
(void) taskq_dispatch(
spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+ (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
#endif
}
return (ZIO_PIPELINE_STOP);
}
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
+ * Report any checksum errors, since the I/O is complete.
+ */
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
+ }
+
+ /*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
@@ -2371,12 +2965,17 @@ zio_done(zio_t *zio)
* I/O pipeline definition
* ==========================================================================
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
+static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
- zio_issue_async,
zio_read_bp_init,
+ zio_free_bp_init,
+ zio_issue_async,
zio_write_bp_init,
zio_checksum_generate,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_allocate,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
index bf7fe733fe0c..c8fe20f2eb4e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -19,14 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <zfs_fletcher.h>
/*
* Checksum vectors.
@@ -49,13 +50,13 @@
* we want the ability to take advantage of that hardware.
*
* Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
*
* When writing a block, we always checksum it with the latest-and-greatest
* checksum function of the appropriate strength. When reading a block,
* we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
*/
/*ARGSUSED*/
@@ -66,19 +67,20 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+ {{NULL, NULL}, 0, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};
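Each row gains a fourth numeric column. The row type in this revision is, as far as I can tell, the following (shown as an assumption to make the table readable; the new field is ci_dedup, set only for sha256 because it is the only function considered strong enough to dedup on without verification):

	typedef struct zio_checksum_info {
		zio_checksum_t	*ci_func[2];	/* native / byteswapped */
		int		ci_correctable;	/* number of correctable bits */
		int		ci_eck;		/* uses embedded checksum */
		int		ci_dedup;	/* dedup-safe without verify */
		char		*ci_name;	/* property / display name */
	} zio_checksum_info_t;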
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
{
ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
@@ -93,6 +95,29 @@ zio_checksum_select(uint8_t child, uint8_t parent)
return (child);
}
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
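The ASSERTs above imply that the dedup checksum property packs an optional verify request into a flag bit above the 8-bit function index, so dedup=verify arrives here as ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY and resolves to the pool's dedup function with the flag carried along. A sketch of the presumed encoding (values assumed from the masks used here):

	#include <stdint.h>

	#define	TOY_CKSUM_MASK		0xffu	/* low 8 bits: function index */
	#define	TOY_CKSUM_VERIFY	(1u << 8) /* verify-on-match flag */

	static inline unsigned
	toy_cksum_func(unsigned prop)
	{
		return (prop & TOY_CKSUM_MASK);
	}

	static inline int
	toy_cksum_wants_verify(unsigned prop)
	{
		return ((prop & TOY_CKSUM_VERIFY) != 0);
	}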
+
/*
* Set the external verifier for a gang block based on <vdev, offset, txg>,
* a tuple which is guaranteed to be unique for the life of the pool.
@@ -101,7 +126,7 @@ static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{
dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = bp->blk_birth;
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
@@ -128,47 +153,79 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t zbt_cksum;
+ zio_cksum_t cksum;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+
+ size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = &zilc->zc_eck;
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_checksum_gang_verifier(&zbt->zbt_cksum, bp);
+ zio_checksum_gang_verifier(&eck->zec_cksum, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&zbt->zbt_cksum, offset);
+ zio_checksum_label_verifier(&eck->zec_cksum, offset);
else
- bp->blk_cksum = zbt->zbt_cksum;
- zbt->zbt_magic = ZBT_MAGIC;
- ci->ci_func[0](data, size, &zbt_cksum);
- zbt->zbt_cksum = zbt_cksum;
+ bp->blk_cksum = eck->zec_cksum;
+ eck->zec_magic = ZEC_MAGIC;
+ ci->ci_func[0](data, size, &cksum);
+ eck->zec_cksum = cksum;
} else {
ci->ci_func[0](data, size, &bp->blk_cksum);
}
}
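Blocks that cannot carry their checksum in a bp (labels, gang headers, ZIL blocks) embed it in the data itself; the magic doubles as a byteswap detector, and for ZILOG2 the checksum moves into a zil_chain_t header so only zc_nused bytes need checksumming. The embedded-checksum layout is presumably:

	/* Assumed layout (renamed from the old zio_block_tail_t). */
	typedef struct zio_eck {
		uint64_t	zec_magic;	/* ZEC_MAGIC; byteswap detector */
		zio_cksum_t	zec_cksum;	/* 256-bit embedded checksum */
	} zio_eck_t;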
int
-zio_checksum_error(zio_t *zio)
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int byteswap;
- void *data = zio->io_data;
+ int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (EINVAL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+ uint64_t nused;
+
+ eck = &zilc->zc_eck;
+ if (eck->zec_magic == ZEC_MAGIC)
+ nused = zilc->zc_nused;
+ else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+ nused = BSWAP_64(zilc->zc_nused);
+ else
+ return (ECKSUM);
+
+ if (nused > size)
+ return (ECKSUM);
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
+
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
zio_checksum_gang_verifier(&verifier, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
@@ -176,15 +233,15 @@ zio_checksum_error(zio_t *zio)
else
verifier = bp->blk_cksum;
- byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
+ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = verifier;
+ expected_cksum = eck->zec_cksum;
+ eck->zec_cksum = verifier;
ci->ci_func[byteswap](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
+ eck->zec_cksum = expected_cksum;
if (byteswap)
byteswap_uint64_array(&expected_cksum,
@@ -196,11 +253,22 @@ zio_checksum_error(zio_t *zio)
ci->ci_func[byteswap](data, size, &actual_cksum);
}
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (ECKSUM);
- if (zio_injection_enabled && !zio->io_error)
- return (zio_handle_fault_injection(zio, ECKSUM));
+ if (zio_injection_enabled && !zio->io_error &&
+ (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
+
+ info->zbc_injected = 1;
+ return (error);
+ }
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
index c563be4eb955..f148977c4468 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/compress.h>
#include <sys/spa.h>
@@ -51,10 +49,11 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{gzip_compress, gzip_decompress, 7, "gzip-7"},
{gzip_compress, gzip_decompress, 8, "gzip-8"},
{gzip_compress, gzip_decompress, 9, "gzip-9"},
+ {zle_compress, zle_decompress, 64, "zle"},
};
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
+enum zio_compress
+zio_compress_select(enum zio_compress child, enum zio_compress parent)
{
ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
@@ -69,80 +68,65 @@ zio_compress_select(uint8_t child, uint8_t parent)
return (child);
}
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
- uint64_t *destsizep, uint64_t *destbufsizep)
+size_t
+zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- uint64_t ciosize, gapsize, destbufsize;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- char *dest;
- uint_t allzero;
+ size_t c_len, d_len, r_len;
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
/*
* If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by setting *destsizep = 0.
+ * a block for it. We indicate this by returning zero size.
*/
- allzero = 1;
- word = src;
- word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
- while (word < word_end) {
- if (*word++ != 0) {
- allzero = 0;
+ word_end = (uint64_t *)((char *)src + s_len);
+ for (word = src; word < word_end; word++)
+ if (*word != 0)
break;
- }
- }
- if (allzero) {
- *destp = NULL;
- *destsizep = 0;
- *destbufsizep = 0;
- return (1);
- }
- if (cpfunc == ZIO_COMPRESS_EMPTY)
+ if (word == word_end)
return (0);
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
/* Compress at least 12.5% */
- destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
- if (destbufsize == 0)
- return (0);
- dest = zio_buf_alloc(destbufsize);
+ d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
+ if (d_len == 0)
+ return (s_len);
- ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
- (size_t)destbufsize, ci->ci_level);
- if (ciosize > destbufsize) {
- zio_buf_free(dest, destbufsize);
- return (0);
- }
+ c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
- /* Cool. We compressed at least as much as we were hoping to. */
+ if (c_len > d_len)
+ return (s_len);
- /* For security, make sure we don't write random heap crap to disk */
- gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
- if (gapsize != 0) {
- bzero(dest + ciosize, gapsize);
- ciosize += gapsize;
+ /*
+ * Cool. We compressed at least as much as we were hoping to.
+ * For both security and repeatability, pad out the last sector.
+ */
+ r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
+ if (r_len > c_len) {
+ bzero((char *)dst + c_len, r_len - c_len);
+ c_len = r_len;
}
- ASSERT3U(ciosize, <=, destbufsize);
- ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
- *destp = dest;
- *destsizep = ciosize;
- *destbufsizep = destbufsize;
+ ASSERT3U(c_len, <=, d_len);
+ ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
- return (1);
+ return (c_len);
}
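The rewritten function folds three outcomes into a single size_t: 0 means the block was all zeroes (record a hole), s_len means store raw (EMPTY, output overflow, or the 12.5% floor was missed), and anything else is the sector-padded compressed length. For a 128K block, d_len = P2ALIGN(131072 - 16384, 512) = 114688, so compression must reach 112K to be kept. A caller-side sketch of the contract (hypothetical wrapper, assuming the headers above):

	static void
	toy_write_block(void *src, void *dst, size_t s_len)
	{
		size_t c_len = zio_compress_data(ZIO_COMPRESS_LZJB,
		    src, dst, s_len);

		if (c_len == 0) {
			/* all-zero block: allocate nothing, record a hole */
		} else if (c_len == s_len) {
			/* incompressible or gain < 12.5%: write src raw */
		} else {
			/* write c_len bytes of dst; c_len is sector-aligned */
		}
	}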
int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize)
+zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len)
{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (EINVAL);
- return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index f8e6880c90f7..9ae7d1f697fd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -43,8 +42,8 @@
#include <sys/arc.h>
#include <sys/zio_impl.h>
#include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
#include <sys/fs/zfs.h>
uint32_t zio_injection_enabled;
@@ -70,8 +69,9 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
/*
* Check for a match against the MOS, which is based on type
*/
- if (zb->zb_objset == 0 && record->zi_objset == 0 &&
- record->zi_object == 0) {
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
if (record->zi_type == DMU_OT_NONE ||
type == record->zi_type)
return (record->zi_freq == 0 ||
@@ -96,6 +96,31 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
}
/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_type == type &&
+ strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
*/
@@ -126,8 +151,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_spa != handler->zi_spa)
continue;
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
+ /* Ignore device errors and panic injection */
+ if (handler->zi_record.zi_guid != 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
/* If this handler matches, return EIO */
@@ -159,7 +186,7 @@ zio_handle_label_injection(zio_t *zio, int error)
int label;
int ret = 0;
- if (offset + zio->io_size > VDEV_LABEL_START_SIZE &&
+ if (offset >= VDEV_LABEL_START_SIZE &&
offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
return (0);
@@ -170,8 +197,10 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults */
- if (handler->zi_record.zi_start == 0)
+ /* Ignore device only faults or panic injection */
+ if (handler->zi_record.zi_start == 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
/*
@@ -200,13 +229,30 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
inject_handler_t *handler;
int ret = 0;
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore label specific faults */
- if (handler->zi_record.zi_start != 0)
+ /*
+ * Ignore label specific faults, panic injection
+ * or fake writes
+ */
+ if (handler->zi_record.zi_start != 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -216,6 +262,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
continue;
}
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
if (handler->zi_record.zi_error == error) {
/*
* For a failed open, pretend like the device
@@ -224,6 +276,16 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
if (error == ENXIO)
vd->vdev_stat.vs_aux =
VDEV_AUX_OPEN_FAILED;
+
+ /*
+ * Treat these errors as if they had been
+ * retried so that all the appropriate stats
+ * and FMA events are generated.
+ */
+ if (!handler->zi_record.zi_failfast &&
+ zio != NULL)
+ zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
ret = error;
break;
}
@@ -240,6 +302,84 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
}
/*
+ * Simulate hardware that ignores cache flushes. For the requested number
+ * of seconds, nix the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_duration == 0)
+ continue;
+
+ /*
+ * Positive duration implies # of seconds, negative
+ * a number of txgs
+ */
+ if (handler->zi_record.zi_timer == 0) {
+ if (handler->zi_record.zi_duration > 0)
+ handler->zi_record.zi_timer = ddi_get_lbolt64();
+ else
+ handler->zi_record.zi_timer = zio->io_txg;
+ }
+
+ /* Have a "problem" writing 60% of the time */
+ if (spa_get_random(100) < 60)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ break;
+ }
+
+ rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+ inject_handler_t *handler;
+
+ if (zio_injection_enabled == 0)
+ return;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_duration == 0)
+ continue;
+
+ if (handler->zi_record.zi_duration > 0) {
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer +
+ handler->zi_record.zi_duration * hz >
+ ddi_get_lbolt64());
+ } else {
+ /* duration is negative so the subtraction here adds */
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer -
+ handler->zi_record.zi_duration >=
+ spa_syncing_txg(spa));
+ }
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,
* which is the switch to trigger all fault injection.
@@ -336,7 +476,6 @@ int
zio_clear_fault(int id)
{
inject_handler_t *handler;
- int ret;
rw_enter(&inject_lock, RW_WRITER);
@@ -346,18 +485,18 @@ zio_clear_fault(int id)
break;
if (handler == NULL) {
- ret = ENOENT;
- } else {
- list_remove(&inject_handlers, handler);
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
- ret = 0;
+ rw_exit(&inject_lock);
+ return (ENOENT);
}
+ list_remove(&inject_handlers, handler);
rw_exit(&inject_lock);
- return (ret);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+
+ return (0);
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
new file mode 100644
index 000000000000..13c5673fbe26
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
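A worked example with the parameter the compression table above actually uses (n = 64): the five input bytes 41 42 00 00 00 ('A', 'B', then three zeroes) encode to four bytes, 01 41 42 42. The first chunk's length byte 0x01 < 64, so the next 0x01 + 1 = 2 bytes are literals; the second chunk's length byte 0x42 = 66 >= 64 stands for 66 - 64 + 1 = 3 zeroes and carries no data bytes. Decompression recomputes the same counts: len = 1 + 1 = 2 <= n copies two literals, and len = 1 + 66 = 67 > n emits 67 - 64 = 3 zeroes.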
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
new file mode 100644
index 000000000000..ec94b08555be
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
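The calling convention this implies, as a sketch (kernel-only API; assumes the functions defined below and an initialized zrlock_t):

	static zrlock_t zrl;		/* zrl_init(&zrl) done elsewhere */

	static void
	toy_reader(void)
	{
		zrl_add(&zrl);	/* pin: the count can no longer reach zero */
		/* ... use the object; nesting another zrl_add() is fine ... */
		zrl_remove(&zrl);
	}

	static void
	toy_would_be_writer(void)
	{
		if (zrl_tryenter(&zrl)) {   /* never blocks behind readers */
			/* ... zero refs guaranteed: safe to tear down ... */
			zrl_exit(&zrl);
		}
		/* else: no blocking enter exists; back off and retry later */
	}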
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED ((uint32_t)-1)
+#define ZRL_DESTROYED -2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == 0);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+#ifdef ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ ASSERT(zrl->zr_refcount >= 0);
+ zrl->zr_refcount++;
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ mutex_exit(&zrl->zr_mtx);
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT((int32_t)n > ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ int n = (int)zrl->zr_refcount;
+ return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 8f769e6f9a81..507f297674e5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -19,13 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*/
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
+
+/* Portions Copyright 2010 Robert Milkowski */
/*
* ZFS volume emulation driver.
@@ -36,9 +36,13 @@
* /dev/zvol/dsk/<pool_name>/<dataset_name>
* /dev/zvol/rdsk/<pool_name>/<dataset_name>
*
- * These links are created by the ZFS-specific devfsadm link generator.
+ * These links are created by the /dev filesystem (sdev_zvolops.c).
* Volumes are persistent through reboot. No user command needs to be
* run before opening and using a device.
+ *
+ * FreeBSD notes.
+ * On FreeBSD, ZVOLs are simply GEOM providers like any other storage device
+ * in the system.
*/
#include <sys/types.h>
@@ -77,8 +81,6 @@
#include "zfs_namecheck.h"
-#define ZVOL_DUMPSIZE "dumpsize"
-
struct g_class zfs_zvol_class = {
.name = "ZFS::ZVOL",
.version = G_VERSION,
@@ -86,13 +88,18 @@ struct g_class zfs_zvol_class = {
DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+void *zfsdev_state;
+static char *zvol_tag = "zvol_tag";
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
/*
- * This lock protects the zvol_state structure from being modified
+ * This lock protects the zfsdev_state structure from being modified
* while it's being used, e.g. an open that comes in before a create
* finishes. It also protects temporary opens of the dataset so that,
* e.g., an open doesn't get a spurious EBUSY.
*/
-static kmutex_t zvol_state_lock;
+kmutex_t zfsdev_state_lock;
static uint32_t zvol_minors;
typedef struct zvol_extent {
@@ -110,13 +117,13 @@ typedef struct zvol_state {
uint64_t zv_volblocksize; /* volume block size */
struct g_provider *zv_provider; /* GEOM provider */
uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_flags; /* readonly; dumpified */
+ uint8_t zv_flags; /* readonly, dumpified, etc. */
objset_t *zv_objset; /* objset handle */
- uint32_t zv_mode; /* DS_MODE_* flags at open time */
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
znode_t zv_znode; /* for range locking */
+ dmu_buf_t *zv_dbuf; /* bonus handle */
int zv_state;
struct bio_queue_head zv_queue;
struct mtx zv_queue_mtx; /* zv_queue mutex */
@@ -128,25 +135,45 @@ typedef struct zvol_state {
#define ZVOL_RDONLY 0x1
#define ZVOL_DUMPIFIED 0x2
#define ZVOL_EXCL 0x4
+#define ZVOL_WCE 0x8
/*
* zvol maximum transfer in one DMU tx.
*/
int zvol_maxphys = DMU_MAX_ACCESS/2;
-extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
+extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
+ nvlist_t *, nvlist_t **);
+static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
+static zvol_state_t *zvol_geom_create(const char *name);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+
static void
-zvol_size_changed(zvol_state_t *zv, major_t maj)
+zvol_size_changed(zvol_state_t *zv)
{
+#ifdef sun
+ dev_t dev = makedevice(maj, min);
+
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Size", volsize) == DDI_SUCCESS);
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
+
+ /* Notify specfs to invalidate the cached size */
+ spec_size_invalidate(dev, VBLK);
+ spec_size_invalidate(dev, VCHR);
+#else /* !sun */
struct g_provider *pp;
- g_topology_assert();
-
pp = zv->zv_provider;
if (pp == NULL)
return;
@@ -159,6 +186,7 @@ zvol_size_changed(zvol_state_t *zv, major_t maj)
if (zv->zv_total_opens > 0)
return;
pp->mediasize = zv->zv_volsize;
+#endif /* !sun */
}
int
@@ -188,17 +216,6 @@ zvol_check_volblocksize(uint64_t volblocksize)
return (0);
}
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
- zvol_state_t *zv = arg;
-
- if (newval)
- zv->zv_flags |= ZVOL_RDONLY;
- else
- zv->zv_flags &= ~ZVOL_RDONLY;
-}
-
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
@@ -206,7 +223,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
dmu_object_info_t doi;
uint64_t val;
-
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
if (error)
return (error);
@@ -228,230 +244,24 @@ zvol_minor_lookup(const char *name)
{
struct g_provider *pp;
struct g_geom *gp;
+ zvol_state_t *zv = NULL;
- g_topology_assert();
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ g_topology_lock();
LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
- return (pp->private);
- }
- }
-
- return (NULL);
-}
-
-static int
-zvol_access(struct g_provider *pp, int acr, int acw, int ace)
-{
- zvol_state_t *zv;
-
- g_topology_assert();
- mutex_enter(&zvol_state_lock);
-
- zv = pp->private;
- if (zv == NULL) {
- if (acr <= 0 && acw <= 0 && ace <= 0)
- return (0);
- mutex_exit(&zvol_state_lock);
- return (pp->error);
- }
-
- ASSERT(zv->zv_objset != NULL);
-
- if (acw > 0 &&
- ((zv->zv_flags & ZVOL_RDONLY) ||
- (zv->zv_mode & DS_MODE_READONLY))) {
- mutex_exit(&zvol_state_lock);
- return (EROFS);
- }
-
- zv->zv_total_opens += acr + acw + ace;
- zvol_size_changed(zv, 0);
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- */
-ssize_t zvol_immediate_write_sz = 32768;
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- zilog_t *zilog = zv->zv_zilog;
- lr_write_t *lr;
-
- if (zilog->zl_replay) {
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
- zilog->zl_replaying_seq;
- return;
- }
-
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
- lr = (lr_write_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = nbytes;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
- BP_ZERO(&lr->lr_blkptr);
-
- (void) zil_itx_assign(zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
- }
-}
-
-static void
-zvol_start(struct bio *bp)
-{
- zvol_state_t *zv;
-
- switch (bp->bio_cmd) {
- case BIO_READ:
- case BIO_WRITE:
- case BIO_FLUSH:
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- mtx_lock(&zv->zv_queue_mtx);
- bioq_insert_tail(&zv->zv_queue, bp);
- wakeup_one(&zv->zv_queue);
- mtx_unlock(&zv->zv_queue_mtx);
- break;
- case BIO_GETATTR:
- if (g_handleattr_int(bp, "ZFS::iszvol", 1))
- break;
- /* FALLTHROUGH */
- case BIO_DELETE:
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
-}
-
-static void
-zvol_serve_one(zvol_state_t *zv, struct bio *bp)
-{
- uint64_t off, volsize;
- size_t resid;
- char *addr;
- objset_t *os;
- rl_t *rl;
- int error = 0;
- boolean_t doread = (bp->bio_cmd == BIO_READ);
-
- off = bp->bio_offset;
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
- addr = bp->bio_data;
- resid = bp->bio_length;
-
- error = 0;
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- * A better approach than a per zvol rwlock would be to lock ranges.
- */
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
- doread ? RL_READER : RL_WRITER);
-
- while (resid != 0 && off < volsize) {
- size_t size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
-
- if (size > volsize - off) /* don't write past the end */
- size = volsize - off;
-
- if (doread) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr,
- DMU_READ_PREFETCH);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
- dmu_tx_commit(tx);
- }
- }
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = EIO;
- break;
- }
- off += size;
- addr += size;
- resid -= size;
- }
- zfs_range_unlock(rl);
-
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length)
- bp->bio_error = (off > volsize ? EINVAL : error);
-}
-
-static void
-zvol_worker(void *arg)
-{
- zvol_state_t *zv;
- struct bio *bp;
-
- thread_lock(curthread);
- sched_prio(curthread, PRIBIO);
- thread_unlock(curthread);
-
- zv = arg;
- for (;;) {
- mtx_lock(&zv->zv_queue_mtx);
- bp = bioq_takefirst(&zv->zv_queue);
- if (bp == NULL) {
- if (zv->zv_state == 1) {
- zv->zv_state = 2;
- wakeup(&zv->zv_state);
- mtx_unlock(&zv->zv_queue_mtx);
- kthread_exit();
- }
- msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
- "zvol:io", 0);
+ pp = LIST_FIRST(&gp->provider);
+ if (pp == NULL)
continue;
- }
- mtx_unlock(&zv->zv_queue_mtx);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- break;
- case BIO_READ:
- case BIO_WRITE:
- zvol_serve_one(zv, bp);
+ zv = pp->private;
+ if (zv == NULL)
+ continue;
+ if (strcmp(zv->zv_name, name) == 0)
break;
- }
-
- if (bp->bio_cmd == BIO_FLUSH && !zil_disable)
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-
- g_io_deliver(bp, bp->bio_error);
}
+ g_topology_unlock();
+
+ return (gp != NULL ? zv : NULL);
}
/* extent mapping arg */
@@ -462,8 +272,8 @@ struct maparg {
/*ARGSUSED*/
static int
-zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct maparg *ma = arg;
zvol_extent_t *ze;
@@ -515,6 +325,7 @@ zvol_free_extents(zvol_state_t *zv)
static int
zvol_get_lbas(zvol_state_t *zv)
{
+ objset_t *os = zv->zv_objset;
struct maparg ma;
int err;
@@ -522,7 +333,9 @@ zvol_get_lbas(zvol_state_t *zv)
ma.ma_blks = 0;
zvol_free_extents(zv);
- err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
+ /* commit any in-flight changes before traversing the dataset */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = traverse_dataset(dmu_objset_ds(os), 0,
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
zvol_free_extents(zv);
@@ -577,25 +390,32 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
objset_t *os = zv->zv_objset;
char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
+ uint64_t offset, length;
dmu_tx_t *tx;
int error;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(dmu_objset_zil(os), lr);
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
dmu_tx_commit(tx);
}
@@ -636,58 +456,101 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_WRITE2 */
};
+#ifdef sun
+int
+zvol_name2minor(const char *name, minor_t *minor)
+{
+ zvol_state_t *zv;
+
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+ if (minor && zv)
+ *minor = zv->zv_minor;
+ mutex_exit(&zfsdev_state_lock);
+ return (zv ? 0 : -1);
+}
+#endif /* sun */
+
/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
int
-zvol_create_minor(const char *name, major_t maj)
+zvol_create_minor(const char *name)
{
- struct g_provider *pp;
- struct g_geom *gp;
+ zfs_soft_state_t *zs;
zvol_state_t *zv;
objset_t *os;
dmu_object_info_t doi;
- uint64_t volsize;
- int ds_mode = DS_MODE_OWNER;
int error;
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
- if ((zv = zvol_minor_lookup(name)) != NULL) {
- error = EEXIST;
- goto end;
+ mutex_enter(&zfsdev_state_lock);
+
+ if (zvol_minor_lookup(name) != NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (EEXIST);
}
- if (strchr(name, '@') != 0)
- ds_mode |= DS_MODE_READONLY;
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
- error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
- if (error)
- goto end;
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
- dmu_objset_close(os);
- goto end;
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
}
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_start;
- gp->access = zvol_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
- pp->mediasize = volsize;
- pp->sectorsize = DEV_BSIZE;
+#ifdef sun
+ if ((minor = zfsdev_minor_alloc()) == 0) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
+ }
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- (void) strcpy(zv->zv_name, name);
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+ (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
+ (char *)name);
+
+ (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
+
+ if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+
+ (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
+
+ if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(zfs_dip, chrbuf);
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_ZVOL;
+ zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+#else /* !sun */
+
+ DROP_GIANT();
+ g_topology_lock();
+ zv = zvol_geom_create(name);
+#endif /* !sun */
+
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
zv->zv_min_bs = DEV_BSHIFT;
- zv->zv_provider = pp;
- zv->zv_volsize = pp->mediasize;
zv->zv_objset = os;
- zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, zvol_get_data);
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
@@ -698,93 +561,134 @@ zvol_create_minor(const char *name, major_t maj)
ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size;
- zil_replay(os, zv, zvol_replay_vector);
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ dmu_objset_disown(os, FTAG);
+ zv->zv_objset = NULL;
- /* XXX this should handle the possible i/o error */
- VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
+ zvol_minors++;
- pp->private = zv;
- g_error_provider(pp, 0);
+ mutex_exit(&zfsdev_state_lock);
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
- zv->zv_state = 0;
- kproc_kthread_add(zvol_worker, zv, &zfsproc, NULL, 0, 0, "zfskern",
- "zvol %s", pp->name + strlen(ZVOL_DEV_DIR) + 1);
+ zvol_geom_run(zv);
- zvol_minors++;
-end:
- mutex_exit(&zvol_state_lock);
g_topology_unlock();
PICKUP_GIANT();
- return (error);
+ ZFS_LOG(1, "ZVOL %s created.", name);
+
+ return (0);
}
/*
* Remove minor node for the specified volume.
*/
+static int
+zvol_remove_zv(zvol_state_t *zv)
+{
+#ifdef sun
+ minor_t minor = zv->zv_minor;
+#endif
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ if (zv->zv_total_opens != 0)
+ return (EBUSY);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+#ifdef sun
+ (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
+ ddi_remove_minor_node(zfs_dip, nmbuf);
+#endif /* sun */
+
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
+
+ zvol_geom_destroy(zv);
+
+ zvol_minors--;
+ return (0);
+}
+
int
zvol_remove_minor(const char *name)
{
- struct g_provider *pp;
zvol_state_t *zv;
- int error = 0;
-
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ int rc;
+ mutex_enter(&zfsdev_state_lock);
if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
+ g_topology_lock();
+ rc = zvol_remove_zv(zv);
+ g_topology_unlock();
+ mutex_exit(&zfsdev_state_lock);
+ return (rc);
+}
- if (zv->zv_total_opens != 0) {
- error = EBUSY;
- goto end;
- }
+int
+zvol_first_open(zvol_state_t *zv)
+{
+ objset_t *os;
+ uint64_t volsize;
+ int error;
+ uint64_t readonly;
- VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
+ /* lie and say we're read-only */
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
+ zvol_tag, &os);
+ if (error)
+ return (error);
- mtx_lock(&zv->zv_queue_mtx);
- zv->zv_state = 1;
- wakeup_one(&zv->zv_queue);
- while (zv->zv_state != 2)
- msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
- mtx_unlock(&zv->zv_queue_mtx);
- mtx_destroy(&zv->zv_queue_mtx);
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error) {
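+		/* Not expected to fail; the ASSERT trips DEBUG kernels. */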
+ ASSERT(error == 0);
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+ zv->zv_objset = os;
+ error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+ if (error) {
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+ zv->zv_volsize = volsize;
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+ zvol_size_changed(zv);
- pp = zv->zv_provider;
- pp->private = NULL;
- g_wither_geom(pp->geom, ENXIO);
+ VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
+ NULL) == 0);
+ if (readonly || dmu_objset_is_snapshot(os) ||
+ !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+ else
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ return (error);
+}
+void
+zvol_last_close(zvol_state_t *zv)
+{
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- dmu_objset_close(zv->zv_objset);
+ dmu_buf_rele(zv->zv_dbuf, zvol_tag);
+ zv->zv_dbuf = NULL;
+ dmu_objset_disown(zv->zv_objset, zvol_tag);
zv->zv_objset = NULL;
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
-
- kmem_free(zv, sizeof(*zv));
-
- zvol_minors--;
-end:
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
}
+#ifdef sun
int
zvol_prealloc(zvol_state_t *zv)
{
objset_t *os = zv->zv_objset;
dmu_tx_t *tx;
- void *data;
uint64_t refd, avail, usedobjs, availobjs;
uint64_t resid = zv->zv_volsize;
uint64_t off = 0;
@@ -797,9 +701,6 @@ zvol_prealloc(zvol_state_t *zv)
/* Free old extents if they exist */
zvol_free_extents(zv);
- /* allocate the blocks by writing each one */
- data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
-
while (resid != 0) {
int error;
uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
@@ -809,30 +710,29 @@ zvol_prealloc(zvol_state_t *zv)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- kmem_free(data, SPA_MAXBLOCKSIZE);
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
return (error);
}
- dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+ dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
dmu_tx_commit(tx);
off += bytes;
resid -= bytes;
}
- kmem_free(data, SPA_MAXBLOCKSIZE);
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);
}
+#endif /* sun */
int
-zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
+zvol_update_volsize(objset_t *os, uint64_t volsize)
{
dmu_tx_t *tx;
int error;
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- tx = dmu_tx_create(zv->zv_objset);
+ tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -840,140 +740,232 @@ zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
return (error);
}
- error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
&volsize, tx);
dmu_tx_commit(tx);
if (error == 0)
- error = dmu_free_long_range(zv->zv_objset,
+ error = dmu_free_long_range(os,
ZVOL_OBJ, volsize, DMU_OBJECT_END);
+ return (error);
+}
- /*
- * If we are using a faked-up state (zv_provider == NULL) then don't
- * try to update the in-core zvol state.
- */
- if (error == 0 && zv->zv_provider) {
- zv->zv_volsize = volsize;
- zvol_size_changed(zv, maj);
+void
+zvol_remove_minors(const char *name)
+{
+ struct g_provider *pp, *pptmp;
+ struct g_geom *gp, *gptmp;
+ zvol_state_t *zv;
+ char *namebuf;
+
+ namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
+ (void) strncpy(namebuf, name, strlen(name));
+ (void) strcat(namebuf, "/");
+ DROP_GIANT();
+ mutex_enter(&zfsdev_state_lock);
+ g_topology_lock();
+
+ LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
+ pp = LIST_FIRST(&gp->provider);
+ if (pp == NULL)
+ continue;
+ zv = pp->private;
+ if (zv == NULL)
+ continue;
+ if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
+ (void) zvol_remove_zv(zv);
}
- return (error);
+ kmem_free(namebuf, strlen(name) + 2);
+
+ g_topology_unlock();
+ mutex_exit(&zfsdev_state_lock);
+ PICKUP_GIANT();
}
int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
+ objset_t *os;
int error;
dmu_object_info_t doi;
uint64_t old_volsize = 0ULL;
- zvol_state_t state = { 0 };
-
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ uint64_t readonly;
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- /*
- * If we are doing a "zfs clone -o volsize=", then the
- * minor node won't exist yet.
- */
- error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
- &state.zv_objset);
- if (error != 0)
- goto out;
- zv = &state;
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
}
- old_volsize = zv->zv_volsize;
- if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
+ if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
(error = zvol_check_volsize(volsize,
doi.doi_data_block_size)) != 0)
goto out;
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
+ VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
+ NULL) == 0);
+ if (readonly) {
error = EROFS;
goto out;
}
- error = zvol_update_volsize(zv, maj, volsize);
-
-#if 0
+ error = zvol_update_volsize(os, volsize);
/*
* Reinitialize the dump area to the new size. If we
- * failed to resize the dump area then restore the it back to
- * it's original size.
+ * failed to resize the dump area then restore it back to
+ * its original size.
*/
- if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
- if ((error = zvol_dumpify(zv)) != 0 ||
- (error = dumpvp_resize()) != 0) {
- (void) zvol_update_volsize(zv, maj, old_volsize);
- error = zvol_dumpify(zv);
+ if (zv && error == 0) {
+#ifdef ZVOL_DUMP
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ old_volsize = zv->zv_volsize;
+ zv->zv_volsize = volsize;
+ if ((error = zvol_dumpify(zv)) != 0 ||
+ (error = dumpvp_resize()) != 0) {
+ (void) zvol_update_volsize(os, old_volsize);
+ zv->zv_volsize = old_volsize;
+ error = zvol_dumpify(zv);
+ }
+ }
+#endif /* ZVOL_DUMP */
+ if (error == 0) {
+ zv->zv_volsize = volsize;
+ zvol_size_changed(zv);
}
}
-#endif
+
+#ifdef sun
+ /*
+ * Generate a LUN expansion event.
+ */
+ if (zv && error == 0) {
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
+ zv->zv_minor);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+#endif /* sun */
out:
- if (state.zv_objset)
- dmu_objset_close(state.zv_objset);
+ dmu_objset_rele(os, FTAG);
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+/*ARGSUSED*/
+static int
+zvol_open(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
+ int err = 0;
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ mutex_enter(&zfsdev_state_lock);
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
+ zv = pp->private;
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
+
+ if (zv->zv_total_opens == 0)
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = EROFS;
+ goto out;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = EBUSY;
+ goto out;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_total_opens != 0) {
+ err = EBUSY;
+ goto out;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
}
+#endif
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
- volblocksize, 0, tx);
- if (error == ENOTSUP)
- error = EBUSY;
- dmu_tx_commit(tx);
- if (error == 0)
- zv->zv_volblocksize = volblocksize;
+ zv->zv_total_opens += count;
+ mutex_exit(&zfsdev_state_lock);
+
+ return (err);
+out:
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+}
+
+/*ARGSUSED*/
+static int
+zvol_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int error = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = pp->private;
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
-end:
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_total_opens == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_total_opens != 0);
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_total_opens -= count;
+
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
+static void
+zvol_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_range_unlock(zgd->zgd_rl);
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
}
@@ -985,15 +977,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
+ uint64_t object = ZVOL_OBJ;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length; /* length of user data */
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
int error;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
+
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
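+	/* Lock the range so the data cannot change while it is synced out. */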
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -1002,97 +999,717 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
- DMU_READ_NO_PREFETCH));
+ if (buf != NULL) { /* immediate write */
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else {
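+		/*
+		 * Indirect write: have dmu_sync() write out the whole
+		 * aligned block and record its location in lr_blkptr
+		 * for later replay.
+		 */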
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN(offset, size);
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
+ zvol_get_done(zgd, error);
- /*
- * Lock the range of the block to ensure that when the data is
- * written out and its checksum is being calculated that no other
- * thread can change the block.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
+ return (error);
+}
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ zilog_t *zilog = zv->zv_zilog;
+ boolean_t slogging;
+ ssize_t immediate_write_sz;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
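+	/*
+	 * logbias=throughput biases full-block writes toward dmu_sync()
+	 * (WR_INDIRECT) instead of copying the data into the log itself.
+	 */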
+ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ ? 0 : zvol_immediate_write_sz;
+
+ slogging = spa_has_slogs(zilog->zl_spa) &&
+ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ ssize_t len;
+ itx_wr_state_t write_state;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0) {
/*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to
- * replay.
+		 * Unlike zfs_log_write(), we can be called with writes of
+		 * up to DMU_MAX_ACCESS/2 (5MB).
*/
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
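+		/*
+		 * Pick a write state for this chunk: WR_INDIRECT defers
+		 * the data to dmu_sync(), WR_COPIED embeds it in the itx
+		 * now, and WR_NEED_COPY copies it at commit time.
+		 */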
+ if (blocksize > immediate_write_sz && !slogging &&
+ resid >= blocksize && off % blocksize == 0) {
+ write_state = WR_INDIRECT; /* uses dmu_sync */
+ len = blocksize;
+ } else if (sync) {
+ write_state = WR_COPIED;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ } else {
+ write_state = WR_NEED_COPY;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ }
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = write_state;
+ if (write_state == WR_NEED_COPY)
+ itx->itx_sod += len;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+#ifdef sun
+static int
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
+ boolean_t doread, boolean_t isdump)
+{
+ vdev_disk_t *dvd;
+ int c;
+ int numerrors = 0;
+
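+	/*
+	 * Walk mirror-like children: a read is satisfied by the first
+	 * healthy child, while a write must be sent to every child.
+	 */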
+ for (c = 0; c < vd->vdev_children; c++) {
+ ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (numerrors < vd->vdev_children ? 0 : EIO);
+
+ if (doread && !vdev_readable(vd))
+ return (EIO);
+ else if (!doread && !vdev_writeable(vd))
+ return (EIO);
+
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ offset += VDEV_LABEL_START_SIZE;
+
+ if (ddi_in_panic() || isdump) {
+ ASSERT(!doread);
+ if (doread)
+ return (EIO);
+ return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
+ lbtodb(size)));
+ } else {
+ return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
+ doread ? B_READ : B_WRITE));
}
+}
+
+static int
+zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
+ boolean_t doread, boolean_t isdump)
+{
+ vdev_t *vd;
+ int error;
+ zvol_extent_t *ze;
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+
+	/* Must be sector aligned, and not straddle a block boundary. */
+ if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
+ P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
+ return (EINVAL);
+ }
+ ASSERT(size <= zv->zv_volblocksize);
+
+ /* Locate the extent this belongs to */
+ ze = list_head(&zv->zv_extents);
+ while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
+ offset -= ze->ze_nblks * zv->zv_volblocksize;
+ ze = list_next(&zv->zv_extents, ze);
+ }
+
+ if (!ddi_in_panic())
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
+ offset += DVA_GET_OFFSET(&ze->ze_dva);
+ error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+
+ if (!ddi_in_panic())
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ return (error);
+}
+#endif /* sun */
+
+int
+zvol_strategy(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ rl_t *rl;
+ int error = 0;
+ boolean_t doread = (bp->bio_cmd == BIO_READ);
+ boolean_t sync;
+
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return (0);
+ }
+
+ if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+ g_io_deliver(bp, EROFS);
+ return (0);
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ g_io_deliver(bp, EIO);
+ return (0);
+ }
+
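+	/* Only sync=always makes these writes synchronous. */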
+ sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
/*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
*/
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length)
+ bp->bio_error = (off > volsize ? EINVAL : error);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+
+ return (0);
+}
+
+#ifdef sun
+/*
+ * Set the buffer count to the zvol maximum transfer.
+ * Using our own routine instead of the default minphys()
+ * means that for larger writes we write bigger buffers on X86
+ * (128K instead of 56K) and flush the disk write cache less often
+ * (every zvol_maxphys, currently 1MB) instead of every minphys
+ * (currently 56K on X86 and 128K on sparc).
+ */
+void
+zvol_minphys(struct buf *bp)
+{
+ if (bp->b_bcount > zvol_maxphys)
+ bp->b_bcount = zvol_maxphys;
+}
+
+int
+zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
+{
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ int error = 0;
+ uint64_t size;
+ uint64_t boff;
+ uint64_t resid;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ boff = ldbtob(blkno);
+ resid = ldbtob(nblocks);
+
+ VERIFY3U(boff + resid, <=, zv->zv_volsize);
+
+ while (resid) {
+ size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
+ error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
+ if (error)
+ break;
+ boff += size;
+ addr += size;
+ resid -= size;
+ }
+
return (error);
}
+/*ARGSUSED*/
int
-zvol_busy(void)
+zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
- return (zvol_minors != 0);
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ uint64_t volsize;
+ rl_t *rl;
+ int error = 0;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ volsize = zv->zv_volsize;
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ return (EIO);
+
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_READ,
+ zvol_minphys, uio);
+ return (error);
+ }
+
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_READER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio->uio_loffset)
+ bytes = volsize - uio->uio_loffset;
+
+ error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
+ break;
+ }
+ }
+ zfs_range_unlock(rl);
+ return (error);
}
-void
-zvol_init(void)
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
- ZFS_LOG(1, "ZVOL Initialized.");
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ uint64_t volsize;
+ rl_t *rl;
+ int error = 0;
+ boolean_t sync;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ volsize = zv->zv_volsize;
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ return (EIO);
+
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_WRITE,
+ zvol_minphys, uio);
+ return (error);
+ }
+
+ sync = !(zv->zv_flags & ZVOL_WCE) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_WRITER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio->uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_range_unlock(rl);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ return (error);
}
+int
+zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
+{
+ struct uuid uuid = EFI_RESERVED;
+ efi_gpe_t gpe = { 0 };
+ uint32_t crc;
+ dk_efi_t efi;
+ int length;
+ char *ptr;
+
+ if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
+ return (EFAULT);
+ ptr = (char *)(uintptr_t)efi.dki_data_64;
+ length = efi.dki_length;
+ /*
+ * Some clients may attempt to request a PMBR for the
+ * zvol. Currently this interface will return EINVAL to
+ * such requests. These requests could be supported by
+ * adding a check for lba == 0 and consing up an appropriate
+ * PMBR.
+ */
+ if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
+ return (EINVAL);
+
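+	/* One reserved partition covering all usable LBAs (34 .. last). */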
+ gpe.efi_gpe_StartingLBA = LE_64(34ULL);
+ gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
+ UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+
+ if (efi.dki_lba == 1) {
+ efi_gpt_t gpt = { 0 };
+
+ gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+ gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
+ gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+ gpt.efi_gpt_MyLBA = LE_64(1ULL);
+ gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
+ gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
+ gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
+ gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+ gpt.efi_gpt_SizeOfPartitionEntry =
+ LE_32(sizeof (efi_gpe_t));
+ CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+ gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+ CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+ gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+ if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
+ flag))
+ return (EFAULT);
+ ptr += sizeof (gpt);
+ length -= sizeof (gpt);
+ }
+ if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
+ length), flag))
+ return (EFAULT);
+ return (0);
+}
+
+/*
+ * BEGIN entry points to allow external callers access to the volume.
+ */
+/*
+ * Return the volume parameters needed for access from an external caller.
+ * These values are invariant as long as the volume is held open.
+ */
+int
+zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl)
+{
+ zvol_state_t *zv;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+ if (zv->zv_flags & ZVOL_DUMPIFIED)
+ return (ENXIO);
+
+ ASSERT(blksize && max_xfer_len && minor_hdl &&
+ objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+
+ *blksize = zv->zv_volblocksize;
+ *max_xfer_len = (uint64_t)zvol_maxphys;
+ *minor_hdl = zv;
+ *objset_hdl = zv->zv_objset;
+ *zil_hdl = zv->zv_zilog;
+ *rl_hdl = &zv->zv_znode;
+ *bonus_hdl = zv->zv_dbuf;
+ return (0);
+}
+
+/*
+ * Return the current volume size to an external caller.
+ * The size can change while the volume is open.
+ */
+uint64_t
+zvol_get_volume_size(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return (zv->zv_volsize);
+}
+
+/*
+ * Return the current WCE setting to an external caller.
+ * The WCE setting can change while the volume is open.
+ */
+int
+zvol_get_volume_wce(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
+}
+
+/*
+ * Entry point for external callers to zvol_log_write
+ */
void
-zvol_fini(void)
+zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
{
- mutex_destroy(&zvol_state_lock);
- ZFS_LOG(1, "ZVOL Deinitialized.");
+ zvol_state_t *zv = minor_hdl;
+
+ zvol_log_write(zv, tx, off, resid, sync);
}
+/*
+ * END entry points to allow external callers access to the volume.
+ */
-static boolean_t
-zvol_is_swap(zvol_state_t *zv)
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
- vnode_t *vp;
- boolean_t ret = B_FALSE;
- char *devpath;
- size_t devpathlen;
- int error;
+ zvol_state_t *zv;
+ struct dk_cinfo dki;
+ struct dk_minfo dkm;
+ struct dk_callback *dkc;
+ int error = 0;
+ rl_t *rl;
-#if 0
- devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
- devpath = kmem_alloc(devpathlen, KM_SLEEP);
- (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
- error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
- kmem_free(devpath, devpathlen);
+ mutex_enter(&zfsdev_state_lock);
- ret = !error && IS_SWAPVP(common_specvp(vp));
+ zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
- if (vp != NULL)
- VN_RELE(vp);
-#endif
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
+ }
+ ASSERT(zv->zv_total_opens > 0);
+
+ switch (cmd) {
+
+ case DKIOCINFO:
+ bzero(&dki, sizeof (dki));
+ (void) strcpy(dki.dki_cname, "zvol");
+ (void) strcpy(dki.dki_dname, "zvol");
+ dki.dki_ctype = DKC_UNKNOWN;
+ dki.dki_unit = getminor(dev);
+ dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
+ error = EFAULT;
+ return (error);
+
+ case DKIOCGMEDIAINFO:
+ bzero(&dkm, sizeof (dkm));
+ dkm.dki_lbsize = 1U << zv->zv_min_bs;
+ dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+ dkm.dki_media_type = DK_UNKNOWN;
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+ error = EFAULT;
+ return (error);
+
+ case DKIOCGETEFI:
+ {
+ uint64_t vs = zv->zv_volsize;
+ uint8_t bs = zv->zv_min_bs;
+
+ mutex_exit(&zfsdev_state_lock);
+ error = zvol_getefi((void *)arg, flag, vs, bs);
+ return (error);
+ }
- return (ret);
+ case DKIOCFLUSHWRITECACHE:
+ dkc = (struct dk_callback *)arg;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
+ (*dkc->dkc_callback)(dkc->dkc_cookie, error);
+ error = 0;
+ }
+ return (error);
+
+ case DKIOCGETWCE:
+ {
+ int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+ if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+ flag))
+ error = EFAULT;
+ break;
+ }
+ case DKIOCSETWCE:
+ {
+ int wce;
+ if (ddi_copyin((void *)arg, &wce, sizeof (int),
+ flag)) {
+ error = EFAULT;
+ break;
+ }
+ if (wce) {
+ zv->zv_flags |= ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ } else {
+ zv->zv_flags &= ~ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+ return (0);
+ }
+
+ case DKIOCGGEOM:
+ case DKIOCGVTOC:
+ /*
+ * commands using these (like prtvtoc) expect ENOTSUP
+ * since we're emulating an EFI label
+ */
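+	/* Build a "name/" prefix; only children of the dataset will match. */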
+ error = ENOTSUP;
+ break;
+
+ case DKIOCDUMPINIT:
+ rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dumpify(zv);
+ zfs_range_unlock(rl);
+ break;
+
+ case DKIOCDUMPFINI:
+ if (!(zv->zv_flags & ZVOL_DUMPIFIED))
+ break;
+ rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dump_fini(zv);
+ zfs_range_unlock(rl);
+ break;
+
+ default:
+ error = ENOTTY;
+ break;
+
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+}
+#endif /* sun */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+void
+zvol_init(void)
+{
+ VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
+ 1) == 0);
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ ZFS_LOG(1, "ZVOL Initialized.");
}
+void
+zvol_fini(void)
+{
+ mutex_destroy(&zfsdev_state_lock);
+ ddi_soft_state_fini(&zfsdev_state);
+ ZFS_LOG(1, "ZVOL Deinitialized.");
+}
+
+#ifdef sun
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
@@ -1100,11 +1717,17 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
int error = 0;
objset_t *os = zv->zv_objset;
nvlist_t *nv = NULL;
+ uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
+ DMU_OBJECT_END);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -1122,7 +1745,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&zv->zv_volsize, tx);
} else {
- uint64_t checksum, compress, refresrv, vbs;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
error = dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
@@ -1132,6 +1755,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
error = error ? error : dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
+ if (version >= SPA_VERSION_DEDUP) {
+ error = error ? error :
+ dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+ }
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
@@ -1144,17 +1772,18 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
&vbs, tx);
+ error = error ? error : dmu_object_set_blocksize(
+ os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+ if (version >= SPA_VERSION_DEDUP) {
+ error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
+ &dedup, tx);
+ }
+ if (error == 0)
+ zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
- /* Truncate the file */
- if (!error)
- error = dmu_free_long_range(zv->zv_objset,
- ZVOL_OBJ, 0, DMU_OBJECT_END);
-
- if (error)
- return (error);
-
/*
	 * We only need to update the zvol's property if we are initializing
* the dump area for the first time.
@@ -1169,11 +1798,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
ZIO_CHECKSUM_OFF) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- SPA_MAXBLOCKSIZE) == 0);
+ if (version >= SPA_VERSION_DEDUP) {
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ ZIO_CHECKSUM_OFF) == 0);
+ }
- error = zfs_set_prop_nvlist(zv->zv_name, nv);
+ error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
nvlist_free(nv);
if (error)
@@ -1193,15 +1825,9 @@ zvol_dumpify(zvol_state_t *zv)
dmu_tx_t *tx;
objset_t *os = zv->zv_objset;
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))
+ if (zv->zv_flags & ZVOL_RDONLY)
return (EROFS);
- /*
- * We do not support swap devices acting as dump devices.
- */
- if (zvol_is_swap(zv))
- return (ENOTSUP);
-
if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
@@ -1251,7 +1877,8 @@ zvol_dump_fini(zvol_state_t *zv)
objset_t *os = zv->zv_objset;
nvlist_t *nv;
int error = 0;
- uint64_t checksum, compress, refresrv, vbs;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
+ uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
/*
* Attempt to restore the zvol back to its pre-dumpified state.
@@ -1286,14 +1913,312 @@ zvol_dump_fini(zvol_state_t *zv)
zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
(void) nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
- (void) zfs_set_prop_nvlist(zv->zv_name, nv);
+ if (version >= SPA_VERSION_DEDUP &&
+ zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
+ }
+ (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
nvlist_free(nv);
zvol_free_extents(zv);
zv->zv_flags &= ~ZVOL_DUMPIFIED;
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
+ zv->zv_volblocksize = vbs;
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+#endif /* sun */
+
+static zvol_state_t *
+zvol_geom_create(const char *name)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+ zvol_state_t *zv;
+
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->sectorsize = DEV_BSIZE;
+
+ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+ zv->zv_provider = pp;
+ zv->zv_state = 0;
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+
+ pp->private = zv;
+
+ return (zv);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ pp = zv->zv_provider;
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ g_topology_assert();
+
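+	/* Tell the worker thread to exit and wait until it has done so. */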
+ mtx_lock(&zv->zv_queue_mtx);
+ zv->zv_state = 1;
+ wakeup_one(&zv->zv_queue);
+ while (zv->zv_state != 2)
+ msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+ mtx_destroy(&zv->zv_queue_mtx);
+
+ pp = zv->zv_provider;
+ zv->zv_provider = NULL;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+
+ kmem_free(zv, sizeof(*zv));
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+	 * For simplicity we expect either an open or a close, but not
+	 * both at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
+ * because GEOM already handles that and handles it a bit differently.
+ * GEOM allows for multiple read/exclusive consumers and ZFS allows
+ * only one exclusive consumer, no matter if it is reader or writer.
+ * I prefer the way GEOM works, so I'll leave it to GEOM to decide
+ * what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_open(pp, flags, count);
+ else
+ error = zvol_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_start(struct bio *bp)
+{
+ zvol_state_t *zv;
+ boolean_t first;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+ mtx_lock(&zv->zv_queue_mtx);
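+		/* Wake the worker only on an empty-to-non-empty transition. */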
+ first = (bioq_first(&zv->zv_queue) == NULL);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (first)
+ wakeup_one(&zv->zv_queue);
+ break;
+ case BIO_GETATTR:
+ case BIO_DELETE:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv;
+ struct bio *bp;
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ zv = arg;
+ for (;;) {
+ mtx_lock(&zv->zv_queue_mtx);
+ bp = bioq_takefirst(&zv->zv_queue);
+ if (bp == NULL) {
+ if (zv->zv_state == 1) {
+ zv->zv_state = 2;
+ wakeup(&zv->zv_state);
+ mtx_unlock(&zv->zv_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+ "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zv->zv_queue_mtx);
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ zvol_strategy(bp);
+ break;
+ }
+ }
+}
+
+extern boolean_t dataset_name_hidden(const char *name);
+
+static int
+zvol_create_snapshots(objset_t *os, const char *name)
+{
+ uint64_t cookie, obj;
+ char *sname;
+ int error, len;
+
+ cookie = obj = 0;
+ sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
+ DS_FIND_SNAPSHOTS);
+
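+	/* Walk the dataset's snapshots and create a minor for each one. */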
+ for (;;) {
+ len = snprintf(sname, MAXPATHLEN, "%s@", name);
+ if (len >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ error = ENAMETOOLONG;
+ break;
+ }
+
+ error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
+ sname + len, &obj, &cookie, NULL);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ if ((error = zvol_create_minor(sname)) != 0) {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ sname, error);
+ break;
+ }
+ }
+
+ kmem_free(sname, MAXPATHLEN);
+ return (error);
+}
+
+int
+zvol_create_minors(const char *name)
+{
+ uint64_t cookie;
+ objset_t *os;
+ char *osname, *p;
+ int error, len;
+
+ if (dataset_name_hidden(name))
+ return (0);
+
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+ name, error);
+ return (error);
+ }
+ if (dmu_objset_type(os) == DMU_OST_ZVOL) {
+ if ((error = zvol_create_minor(name)) == 0)
+ error = zvol_create_snapshots(os, name);
+ else {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ name, error);
+ }
+ dmu_objset_rele(os, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+
+ osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
+ return (ENOENT);
+ }
+ p = osname + strlen(osname);
+ len = MAXPATHLEN - (p - osname);
+
+ if (strchr(name, '/') == NULL) {
+ /* Prefetch only for pool name. */
+ cookie = 0;
+ while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
+ (void) dmu_objset_prefetch(p, NULL);
+ }
+
+ cookie = 0;
+ while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
+ &cookie) == 0) {
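+		/*
+		 * Drop the hold while recursing into the child dataset,
+		 * then re-take it to continue the iteration.
+		 */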
+ dmu_objset_rele(os, FTAG);
+		(void) zvol_create_minors(osname);
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+ name, error);
+ return (error);
+ }
+ }
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
index 59ee7818f56d..da397a5cca95 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/types.h>
#include <sys/time.h>
@@ -319,9 +317,9 @@ callb_generic_cpr(void *arg, int code)
#ifdef CPR_NOT_THREAD_SAFE
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_timedwait() returns -1 if it times out. */
- if ((ret = cv_timedwait(&cp->cc_callb_cv,
- cp->cc_lockp,
- callb_timeout_sec * hz)) == -1)
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
break;
#endif
break;
@@ -370,5 +368,71 @@ callb_unlock_table(void)
mutex_exit(&ct->ct_lock);
}
+#ifdef sun
+/*
+ * Return a boolean value indicating whether a particular kernel thread is
+ * stopped in accordance with the cpr callback protocol. If returning
+ * false, also return a pointer to the thread name via the 2nd argument.
+ */
+boolean_t
+callb_is_stopped(kthread_id_t tp, caddr_t *thread_name)
+{
+ callb_t *cp;
+ boolean_t ret_val;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON];
+ cp != NULL && tp != cp->c_thread; cp = cp->c_next)
+ ;
+
+ ret_val = (cp != NULL);
+ if (ret_val) {
+ /*
+ * We found the thread in the callback table and have
+ * provisionally set the return value to true. Now
+ * see if it is marked "safe" and is sleeping or stopped.
+ */
+ callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg;
+
+ *thread_name = cp->c_name; /* in case not stopped */
+ mutex_enter(ccp->cc_lockp);
+
+ if (ccp->cc_events & CALLB_CPR_SAFE) {
+ int retry;
+
+ mutex_exit(ccp->cc_lockp);
+ for (retry = 0; retry < CALLB_MAX_RETRY; retry++) {
+ thread_lock(tp);
+ if (tp->t_state & (TS_SLEEP | TS_STOPPED)) {
+ thread_unlock(tp);
+ break;
+ }
+ thread_unlock(tp);
+ delay(CALLB_THREAD_DELAY);
+ }
+ ret_val = retry < CALLB_MAX_RETRY;
+ } else {
+ ret_val =
+ (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0;
+ mutex_exit(ccp->cc_lockp);
+ }
+ } else {
+ /*
+ * Thread not found in callback table. Make the best
+ * attempt to identify the thread in the error message.
+ */
+ ulong_t offset;
+ char *sym = kobj_getsymname((uintptr_t)tp->t_startpc,
+ &offset);
+
+ *thread_name = sym ? sym : "*unknown*";
+ }
+
+ mutex_exit(&ct->ct_lock);
+ return (ret_val);
+}
+#endif /* sun */
+
SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
new file mode 100644
index 000000000000..3c9ba51e095e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
@@ -0,0 +1,1402 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Fault Management Architecture (FMA) Resource and Protocol Support
+ *
+ * The routines contained herein provide services to support kernel subsystems
+ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
+ *
+ * Name-Value Pair Lists
+ *
+ * The embodiment of an FMA protocol element (event, fmri or authority) is a
+ * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
+ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
+ * to create an nvpair list using custom allocators. Callers may choose to
+ * allocate either from the kernel memory allocator, or from a preallocated
+ * buffer, useful in constrained contexts like high-level interrupt routines.
+ *
+ * Protocol Event and FMRI Construction
+ *
+ * Convenience routines are provided to construct nvlist events according to
+ * the FMA Event Protocol and Naming Schema specification for ereports and
+ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
+ *
+ * ENA Manipulation
+ *
+ * Routines to generate ENA formats 0, 1 and 2 are available as well as
+ * routines to increment formats 1 and 2. Individual fields within the
+ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
+ * fm_ena_format_get() and fm_ena_gen_get().
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysevent.h>
+#include <sys/nvpair.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/compress.h>
+#include <sys/kobj.h>
+#include <sys/kstat.h>
+#include <sys/processor.h>
+#include <sys/pcpu.h>
+#include <sys/sunddi.h>
+#include <sys/systeminfo.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+
+/*
+ * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These
+ * values must be kept in sync with the FMA source code in usr/src/cmd/fm.
+ */
+static const char *fm_url = "http://www.sun.com/msg";
+static const char *fm_msgid = "SUNOS-8000-0G";
+static char *volatile fm_panicstr = NULL;
+
+#ifdef sun
+errorq_t *ereport_errorq;
+#endif
+void *ereport_dumpbuf;
+size_t ereport_dumplen;
+
+static uint_t ereport_chanlen = ERPT_EVCH_MAX;
+static evchan_t *ereport_chan = NULL;
+static ulong_t ereport_qlen = 0;
+static size_t ereport_size = 0;
+static int ereport_cols = 80;
+
+extern void fastreboot_disable_highpil(void);
+
+/*
+ * Common fault management kstats to record ereport generation
+ * failures
+ */
+
+struct erpt_kstat {
+ kstat_named_t erpt_dropped; /* num erpts dropped on post */
+ kstat_named_t erpt_set_failed; /* num erpt set failures */
+ kstat_named_t fmri_set_failed; /* num fmri set failures */
+ kstat_named_t payload_set_failed; /* num payload set failures */
+};
+
+static struct erpt_kstat erpt_kstat_data = {
+ { "erpt-dropped", KSTAT_DATA_UINT64 },
+ { "erpt-set-failed", KSTAT_DATA_UINT64 },
+ { "fmri-set-failed", KSTAT_DATA_UINT64 },
+ { "payload-set-failed", KSTAT_DATA_UINT64 }
+};
+
+#ifdef sun
+/*ARGSUSED*/
+static void
+fm_drain(void *private, void *data, errorq_elem_t *eep)
+{
+ nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep);
+
+ if (!panicstr)
+ (void) fm_ereport_post(nvl, EVCH_TRYHARD);
+ else
+ fm_nvprint(nvl);
+}
+#endif
+
+void
+fm_init(void)
+{
+ kstat_t *ksp;
+
+#ifdef sun
+ (void) sysevent_evc_bind(FM_ERROR_CHAN,
+ &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
+
+ (void) sysevent_evc_control(ereport_chan,
+ EVCH_SET_CHAN_LEN, &ereport_chanlen);
+#endif
+
+ if (ereport_qlen == 0)
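+	/* Size the queue for the worst case: ERPT_MAX_ERRS from each CPU. */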
+ ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
+
+ if (ereport_size == 0)
+ ereport_size = ERPT_DATA_SZ;
+
+#ifdef sun
+ ereport_errorq = errorq_nvcreate("fm_ereport_queue",
+ (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
+ FM_ERR_PIL, ERRORQ_VITAL);
+ if (ereport_errorq == NULL)
+ panic("failed to create required ereport error queue");
+#endif
+
+ ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
+ ereport_dumplen = ereport_size;
+
+ /* Initialize ereport allocation and generation kstats */
+ ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
+ sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp != NULL) {
+ ksp->ks_data = &erpt_kstat_data;
+ kstat_install(ksp);
+ } else {
+ cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
+ }
+}
+
+#ifdef sun
+/*
+ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
+ * output so they aren't split across console lines, and return the end column.
+ */
+/*PRINTFLIKE4*/
+static int
+fm_printf(int depth, int c, int cols, const char *format, ...)
+{
+ va_list ap;
+ int width;
+ char c1;
+
+ va_start(ap, format);
+ width = vsnprintf(&c1, sizeof (c1), format, ap);
+ va_end(ap);
+
+ if (c + width >= cols) {
+ console_printf("\n\r");
+ c = 0;
+ if (format[0] != ' ' && depth > 0) {
+ console_printf(" ");
+ c++;
+ }
+ }
+
+ va_start(ap, format);
+ console_vprintf(format, ap);
+ va_end(ap);
+
+ return ((c + width) % cols);
+}
+
+/*
+ * Recursively print a nvlist in the specified column width and return the
+ * column we end up in. This function is called recursively by fm_nvprint(),
+ * below. We generically format the entire nvpair using hexadecimal
+ * integers and strings, and elide any integer arrays. Arrays are basically
+ * used for cache dumps right now, so we suppress them so as not to overwhelm
+ * the amount of console output we produce at panic time. This can be further
+ * enhanced as FMA technology grows based upon the needs of consumers. All
+ * FMA telemetry is logged using the dump device transport, so the console
+ * output serves only as a fallback in case this procedure is unsuccessful.
+ */
+static int
+fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ if (strcmp(name, FM_CLASS) == 0)
+ continue; /* already printed by caller */
+
+ c = fm_printf(d, c, cols, " %s=", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ c = fm_printf(d + 1, c, cols, " 1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ c = fm_printf(d + 1, c, cols, b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ c = fm_printf(d + 1, c, cols, "\"%s\"",
+ str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ c = fm_nvprintr(cnv, d + 1, c, cols);
+ c = fm_printf(d + 1, c, cols, " ]");
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++) {
+ c = fm_nvprintr(val[i], d + 1, c, cols);
+ }
+ c = fm_printf(d + 1, c, cols, " ]");
+ }
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ c = fm_printf(d + 1, c, cols, "[...]");
+ break;
+ case DATA_TYPE_UNKNOWN:
+ c = fm_printf(d + 1, c, cols, "<unknown>");
+ break;
+ }
+ }
+
+ return (c);
+}
+
+void
+fm_nvprint(nvlist_t *nvl)
+{
+ char *class;
+ int c = 0;
+
+ console_printf("\r");
+
+ if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
+ c = fm_printf(0, c, ereport_cols, "%s", class);
+
+ if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0)
+ console_printf("\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Wrapper for panic() that first produces an FMA-style message for admins.
+ * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this
+ * is the one exception to that rule and the only error that gets messaged.
+ * This function is intended for use by subsystems that have detected a fatal
+ * error and enqueued appropriate ereports and wish to then force a panic.
+ */
+/*PRINTFLIKE1*/
+void
+fm_panic(const char *format, ...)
+{
+ va_list ap;
+
+ (void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+#if defined(__i386) || defined(__amd64)
+ fastreboot_disable_highpil();
+#endif /* __i386 || __amd64 */
+ va_start(ap, format);
+ vpanic(format, ap);
+ va_end(ap);
+}
+
+/*
+ * Simply tell the caller if fm_panicstr is set, i.e. an FMA event has
+ * caused the panic. If so, something other than the default panic
+ * diagnosis method will diagnose the cause of the panic.
+ */
+int
+is_fm_panic()
+{
+ if (fm_panicstr)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Print any appropriate FMA banner message before the panic message. This
+ * function is called by panicsys() and prints the message for fm_panic().
+ * We print the message here so that it comes after the system is quiesced.
+ * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
+ * The rest of the message is for the console only and not needed in the log,
+ * so it is printed using console_printf(). We break it up into multiple
+ * chunks so as to avoid overflowing any small legacy prom_printf() buffers.
+ */
+void
+fm_banner(void)
+{
+ timespec_t tod;
+ hrtime_t now;
+
+ if (!fm_panicstr)
+ return; /* panic was not initiated by fm_panic(); do nothing */
+
+ if (panicstr) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
+ "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
+
+ console_printf(
+"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
+"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
+ fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
+
+ console_printf(
+"PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
+"SOURCE: %s, REV: %s %s\n",
+ platform, utsname.nodename, utsname.sysname,
+ utsname.release, utsname.version);
+
+ console_printf(
+"DESC: Errors have been detected that require a reboot to ensure system\n"
+"integrity. See %s/%s for more information.\n",
+ fm_url, fm_msgid);
+
+ console_printf(
+"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
+"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
+"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Utility function to write all of the pending ereports to the dump device.
+ * This function is called at either normal reboot or panic time, and simply
+ * iterates over the in-transit messages in the ereport sysevent channel.
+ */
+void
+fm_ereport_dump(void)
+{
+ evchanq_t *chq;
+ sysevent_t *sep;
+ erpt_dump_t ed;
+
+ timespec_t tod;
+ hrtime_t now;
+ char *buf;
+ size_t len;
+
+ if (panicstr) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ if (ereport_errorq != NULL)
+ errorq_drain(ereport_errorq);
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ /*
+ * In the panic case, sysevent_evc_walk_init() will return NULL.
+ */
+ if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
+ !panicstr)
+ return; /* event channel isn't initialized yet */
+
+ while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
+ if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
+ break;
+
+ ed.ed_magic = ERPT_MAGIC;
+ ed.ed_chksum = checksum32(buf, len);
+ ed.ed_size = (uint32_t)len;
+ ed.ed_pad = 0;
+ ed.ed_hrt_nsec = SE_TIME(sep);
+ ed.ed_hrt_base = now;
+ ed.ed_tod_base.sec = tod.tv_sec;
+ ed.ed_tod_base.nsec = tod.tv_nsec;
+
+ dumpvp_write(&ed, sizeof (ed));
+ dumpvp_write(buf, len);
+ }
+
+ sysevent_evc_walk_fini(chq);
+}
+#endif
+
+/*
+ * Post an error report (ereport) to the sysevent error channel. The error
+ * channel must be established with a prior call to sysevent_evc_create()
+ * before publication may occur.
+ */
+void
+fm_ereport_post(nvlist_t *ereport, int evc_flag)
+{
+ size_t nvl_size = 0;
+ evchan_t *error_chan;
+ sysevent_id_t eid;
+
+ (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
+ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ return;
+ }
+
+#ifdef sun
+ if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
+ EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ return;
+ }
+
+ if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
+ SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ (void) sysevent_evc_unbind(error_chan);
+ return;
+ }
+ (void) sysevent_evc_unbind(error_chan);
+#else
+ (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
+#endif
+}
+
+/*
+ * Wrappers for FM nvlist allocators
+ */
+/* ARGSUSED */
+static void *
+i_fm_alloc(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_zalloc(size, KM_SLEEP));
+}
+
+/* ARGSUSED */
+static void
+i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t fm_mem_alloc_ops = {
+ NULL,
+ NULL,
+ i_fm_alloc,
+ i_fm_free,
+ NULL
+};
+
+/*
+ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
+ * to the newly allocated nv_alloc_t structure is returned upon success or NULL
+ * is returned to indicate that the nv_alloc structure could not be created.
+ */
+nv_alloc_t *
+fm_nva_xcreate(char *buf, size_t bufsz)
+{
+ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+
+ return (nvhdl);
+}
+
+/*
+ * Destroy a previously allocated nv_alloc structure. The fixed buffer
+ * associated with nva must be freed by the caller.
+ */
+void
+fm_nva_xdestroy(nv_alloc_t *nva)
+{
+ nv_alloc_fini(nva);
+ kmem_free(nva, sizeof (nv_alloc_t));
+}
+
+/*
+ * Create a new nv list. A pointer to a new nv list structure is returned
+ * upon success or NULL is returned to indicate that the structure could
+ * not be created. The newly created nv list is managed by the
+ * operations installed in nva. If nva is NULL, the default FMA nva
+ * operations are installed and used.
+ *
+ * When called from the kernel and nva == NULL, this function must be called
+ * from passive kernel context with no locks held that can prevent a
+ * sleeping memory allocation from occurring. Otherwise, this function may
+ * be called from other kernel contexts as long as a valid nva created via
+ * fm_nva_xcreate() is supplied.
+ */
+nvlist_t *
+fm_nvlist_create(nv_alloc_t *nva)
+{
+ int hdl_alloced = 0;
+ nvlist_t *nvl;
+ nv_alloc_t *nvhdl;
+
+ if (nva == NULL) {
+ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+ hdl_alloced = 1;
+ } else {
+ nvhdl = nva;
+ }
+
+ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
+ if (hdl_alloced) {
+ nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ }
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Destroy a previously allocated nvlist structure. flag indicates whether
+ * or not the associated nva structure should be freed (FM_NVA_FREE) or
+ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
+ * it to be re-used for future nvlist creation operations.
+ */
+void
+fm_nvlist_destroy(nvlist_t *nvl, int flag)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
+
+ nvlist_free(nvl);
+
+ if (nva != NULL) {
+ if (flag == FM_NVA_FREE)
+ fm_nva_xdestroy(nva);
+ }
+}
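+
+/*
+ * For illustration, a hedged sketch of the preallocated-buffer path
+ * described above (the buffer and its size are assumptions):
+ *
+ *    static char ebuf[ERPT_DATA_SZ];
+ *    nv_alloc_t *nva = fm_nva_xcreate(ebuf, sizeof (ebuf));
+ *    nvlist_t *nvl = (nva == NULL) ? NULL : fm_nvlist_create(nva);
+ *
+ * Destroying with FM_NVA_FREE also tears down the nva; FM_NVA_RETAIN
+ * keeps it for later fm_nvlist_create() calls:
+ *
+ *    fm_nvlist_destroy(nvl, FM_NVA_RETAIN);
+ *    fm_nva_xdestroy(nva);
+ */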
+
+int
+i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
+{
+ int nelem, ret = 0;
+ data_type_t type;
+
+ while (ret == 0 && name != NULL) {
+ type = va_arg(ap, data_type_t);
+ switch (type) {
+ case DATA_TYPE_BYTE:
+ ret = nvlist_add_byte(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_byte_array(payload, name,
+ va_arg(ap, uchar_t *), nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ ret = nvlist_add_boolean_value(payload, name,
+ va_arg(ap, boolean_t));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_boolean_array(payload, name,
+ va_arg(ap, boolean_t *), nelem);
+ break;
+ case DATA_TYPE_INT8:
+ ret = nvlist_add_int8(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int8_array(payload, name,
+ va_arg(ap, int8_t *), nelem);
+ break;
+ case DATA_TYPE_UINT8:
+ ret = nvlist_add_uint8(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint8_array(payload, name,
+ va_arg(ap, uint8_t *), nelem);
+ break;
+ case DATA_TYPE_INT16:
+ ret = nvlist_add_int16(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int16_array(payload, name,
+ va_arg(ap, int16_t *), nelem);
+ break;
+ case DATA_TYPE_UINT16:
+ ret = nvlist_add_uint16(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint16_array(payload, name,
+ va_arg(ap, uint16_t *), nelem);
+ break;
+ case DATA_TYPE_INT32:
+ ret = nvlist_add_int32(payload, name,
+ va_arg(ap, int32_t));
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int32_array(payload, name,
+ va_arg(ap, int32_t *), nelem);
+ break;
+ case DATA_TYPE_UINT32:
+ ret = nvlist_add_uint32(payload, name,
+ va_arg(ap, uint32_t));
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint32_array(payload, name,
+ va_arg(ap, uint32_t *), nelem);
+ break;
+ case DATA_TYPE_INT64:
+ ret = nvlist_add_int64(payload, name,
+ va_arg(ap, int64_t));
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int64_array(payload, name,
+ va_arg(ap, int64_t *), nelem);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvlist_add_uint64(payload, name,
+ va_arg(ap, uint64_t));
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint64_array(payload, name,
+ va_arg(ap, uint64_t *), nelem);
+ break;
+ case DATA_TYPE_STRING:
+ ret = nvlist_add_string(payload, name,
+ va_arg(ap, char *));
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_string_array(payload, name,
+ va_arg(ap, char **), nelem);
+ break;
+ case DATA_TYPE_NVLIST:
+ ret = nvlist_add_nvlist(payload, name,
+ va_arg(ap, nvlist_t *));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_nvlist_array(payload, name,
+ va_arg(ap, nvlist_t **), nelem);
+ break;
+ default:
+ ret = EINVAL;
+ }
+
+ name = va_arg(ap, char *);
+ }
+ return (ret);
+}
+
+void
+fm_payload_set(nvlist_t *payload, ...)
+{
+ int ret;
+ const char *name;
+ va_list ap;
+
+ va_start(ap, payload);
+ name = va_arg(ap, char *);
+ ret = i_fm_payload_set(payload, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_add_64(
+ &erpt_kstat_data.payload_set_failed.value.ui64, 1);
+}
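+
+/*
+ * Illustrative use of the varargs interface above; the member names are
+ * hypothetical. The argument list is a sequence of (name, type, value)
+ * tuples terminated by a NULL name:
+ *
+ *    fm_payload_set(ereport,
+ *        "example-count", DATA_TYPE_UINT64, (uint64_t)42,
+ *        "example-name", DATA_TYPE_STRING, "disk0",
+ *        NULL);
+ */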
+
+/*
+ * Set up and validate the members of an ereport event according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * class string ereport
+ * version uint8_t 0
+ * ena uint64_t <ena>
+ * detector nvlist_t <detector>
+ * ereport-payload nvlist_t <var args>
+ *
+ * We don't actually add a 'version' member to the payload. Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else. Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
+ */
+void
+fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
+ uint64_t ena, const nvlist_t *detector, ...)
+{
+ char ereport_class[FM_MAX_CLASS];
+ const char *name;
+ va_list ap;
+ int ret;
+
+ if (version != FM_EREPORT_VERS0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ return;
+ }
+
+ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
+ FM_EREPORT_CLASS, erpt_class);
+ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ }
+
+ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
+ (nvlist_t *)detector) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ }
+
+ va_start(ap, detector);
+ name = va_arg(ap, const char *);
+ ret = i_fm_payload_set(ereport, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+}
+
+/*
+ * Set up and validate the members of an hc fmri according to:
+ *
+ * Member name Type Value
+ * ===================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * hc-name string <name>
+ * hc-id string <id>
+ *
+ * Note that auth and hc-id are optional members.
+ */
+
+#define HC_MAXPAIRS 20
+#define HC_MAXNAMELEN 50
+
+static int
+fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
+{
+ if (version != FM_HC_SCHEME_VERSION) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
+ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ return (1);
+}
+
+void
+fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ va_list ap;
+ int i;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = 0; i < npairs; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+ va_end(ap);
+
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+
+ for (i = 0; i < npairs; i++)
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
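+
+/*
+ * Illustrative only: building a two-level hc FMRI with the routine above.
+ * The hc-name strings are hypothetical; the varargs are npairs
+ * (name, id) pairs:
+ *
+ *    fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, 2,
+ *        "motherboard", 0, "chip", 1);
+ */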
+
+/*
+ * Set up and validate the members of a dev fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * devpath string <devpath>
+ * [devid] string <devid>
+ * [target-port-l0id] string <target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+ const char *devpath, const char *devid, const char *tpl0)
+{
+ int err = 0;
+
+ if (version != DEV_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+ if (auth != NULL) {
+ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth);
+ }
+
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+ if (devid != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+ if (tpl0 != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+ if (err)
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+}
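+
+/*
+ * Sketch of a caller, with a hypothetical device path and neither a devid
+ * nor a target-port-l0id:
+ *
+ *    fm_fmri_dev_set(fmri, DEV_SCHEME_VERSION0, NULL,
+ *        "/pci@0,0/pci1022,7450@2/pci1000,3060@3", NULL, NULL);
+ */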
+
+/*
+ * Set up and validate the members of a cpu fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * cpuid uint32_t <cpu_id>
+ * cpumask uint8_t <cpu_mask>
+ * serial uint64_t <serial_id>
+ *
+ * Note that auth, cpumask, serial are optional members.
+ */
+void
+fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
+ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
+{
+ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
+
+ if (version < CPU_SCHEME_VERSION1) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
+ FM_FMRI_SCHEME_CPU) != 0) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
+ *cpu_maskp) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
+ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
+ atomic_add_64(failedp, 1);
+}
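+
+/*
+ * Illustrative call identifying CPU 3 with no authority or mask; note that
+ * the routine above counts a NULL serial_idp as a set failure even though
+ * the comment lists serial as optional:
+ *
+ *    fm_fmri_cpu_set(fmri, CPU_SCHEME_VERSION1, NULL, 3, NULL, NULL);
+ */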
+
+/*
+ * Set up and validate the members of a mem fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth> [optional]
+ * unum string <unum>
+ * serial string <serial> [optional*]
+ * offset uint64_t <offset> [optional]
+ *
+ * * serial is required if offset is present
+ */
+void
+fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ const char *unum, const char *serial, uint64_t offset)
+{
+ if (version != MEM_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (!serial && (offset != (uint64_t)-1)) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (auth != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (serial != NULL) {
+ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
+ (char **)&serial, 1) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ if (offset != (uint64_t)-1) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET,
+ offset) != 0) {
+ atomic_add_64(&erpt_kstat_data.
+ fmri_set_failed.value.ui64, 1);
+ }
+ }
+ }
+}
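+
+/*
+ * Illustrative call; the unum and serial strings are hypothetical. Callers
+ * pass (uint64_t)-1 as the offset to mean "not present":
+ *
+ *    fm_fmri_mem_set(fmri, MEM_SCHEME_VERSION0, NULL,
+ *        "/SYS/MB/CMP0/BR0/CH0/D0", "12345678", 0x1000ULL);
+ */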
+
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
+
+uint64_t
+fm_ena_increment(uint64_t ena)
+{
+ uint64_t new_ena;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
+ break;
+ case FM_ENA_FMT2:
+ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
+ break;
+ default:
+ new_ena = 0;
+ }
+
+ return (new_ena);
+}
+
+uint64_t
+fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
+{
+ uint64_t ena = 0;
+
+ switch (format) {
+ case FM_ENA_FMT1:
+ if (timestamp) {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((timestamp << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ } else {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ }
+ break;
+ case FM_ENA_FMT2:
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
+ break;
+ default:
+ break;
+ }
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generate(uint64_t timestamp, uchar_t format)
+{
+ return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
+}
+
+uint64_t
+fm_ena_generation_get(uint64_t ena)
+{
+ uint64_t gen;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
+ break;
+ default:
+ gen = 0;
+ break;
+ }
+
+ return (gen);
+}
+
+uchar_t
+fm_ena_format_get(uint64_t ena)
+{
+ return (ENA_FORMAT(ena));
+}
+
+uint64_t
+fm_ena_id_get(uint64_t ena)
+{
+ uint64_t id;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
+ break;
+ default:
+ id = 0;
+ }
+
+ return (id);
+}
+
+uint64_t
+fm_ena_time_get(uint64_t ena)
+{
+ uint64_t time;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
+ break;
+ default:
+ time = 0;
+ }
+
+ return (time);
+}
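+
+/*
+ * Hedged round-trip example for the ENA routines above:
+ *
+ *    uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
+ *    ASSERT(fm_ena_format_get(ena) == FM_ENA_FMT1);
+ *    uint64_t t = fm_ena_time_get(ena);    (timestamp bits)
+ *    ena = fm_ena_increment(ena);          (bumps the generation field)
+ */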
+
+#ifdef sun
+/*
+ * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
+ * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
+ */
+void
+fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
+{
+ int i;
+ char *sym;
+ ulong_t off;
+ char *stkpp[FM_STK_DEPTH];
+ char buf[FM_STK_DEPTH * FM_SYM_SZ];
+ char *stkp = buf;
+
+ for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
+ if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
+ (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
+ else
+ (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
+ stkpp[i] = stkp;
+ }
+
+ fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
+ DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL);
+}
+#endif
+
+#ifdef sun
+void
+print_msg_hwerr(ctid_t ct_id, proc_t *p)
+{
+ uprintf("Killed process %d (%s) in contract id %d "
+ "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
+}
+#endif
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ nvlist_t **hcl;
+ uint_t n;
+ int i, j;
+ va_list ap;
+ char *hcname, *hcid;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ /*
+ * copy the bboard nvpairs to the pairs array
+ */
+ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+ != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+ &hcname) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+
+ /*
+ * create the pairs from passed in pairs
+ */
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = n; i < npairs + n; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+ va_end(ap);
+
+ /*
+ * Create the fmri hc list
+ */
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+ npairs + n) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ for (i = 0; i < npairs + n; i++) {
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+ }
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
index b0ec0639781a..991978e41d19 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
@@ -37,7 +37,7 @@
#undef _SYS_ACL_H
#include_next <sys/acl.h>
#define _SYS_ACL_H
-#endif /* _KERNEL */
+#endif /* _KERNEL */
#ifdef __cplusplus
extern "C" {
@@ -57,7 +57,7 @@ typedef struct ace {
uint16_t a_type; /* allow or deny */
} ace_t;
-#if !defined(_KERNEL)
+#ifndef _KERNEL
typedef struct acl_info acl_t;
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
index 878ddcbeb09f..8718f5bcf63f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
@@ -47,7 +47,7 @@ typedef enum acl_type {
} zfs_acl_type_t;
struct acl_info {
- zfs_acl_type_t acl_type; /* style of acl */
+ zfs_acl_type_t acl_type; /* style of acl */
int acl_cnt; /* number of acl entries */
int acl_entry_size; /* sizeof acl entry */
int acl_flags; /* special flags about acl */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
index 02263a5a0cf1..ba305c908239 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _AVL_H
#define _AVL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This is a private header file. Applications should not directly include
* this file.
@@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree,
* node - node that has the value being looked for
* where - position for use with avl_nearest() or avl_insert(), may be NULL
*/
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
/*
* Insert a node into the tree.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h b/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h
deleted file mode 100644
index a2bab580a54b..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#ifndef _SYS_BYTEORDER_H
-#define _SYS_BYTEORDER_H
-
-#include <sys/isa_defs.h>
-#include <sys/int_types.h>
-
-#if defined(__GNUC__) && defined(_ASM_INLINES) && \
- (defined(__i386) || defined(__amd64))
-#include <asm/byteorder.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * macros for conversion between host and (internet) network byte order
- */
-
-#if BYTE_ORDER == _BIG_ENDIAN && !defined(ntohl) && !defined(__lint)
-/* big-endian */
-#define ntohl(x) (x)
-#define ntohs(x) (x)
-#define htonl(x) (x)
-#define htons(x) (x)
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-#define ntohll(x) (x)
-#define htonll(x) (x)
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-
-#elif !defined(ntohl) /* little-endian */
-
-#ifndef _IN_PORT_T
-#define _IN_PORT_T
-typedef uint16_t in_port_t;
-#endif
-
-#ifndef _IN_ADDR_T
-#define _IN_ADDR_T
-typedef uint32_t in_addr_t;
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5)
-extern uint32_t htonl(uint32_t);
-extern uint16_t htons(uint16_t);
-extern uint32_t ntohl(uint32_t);
-extern uint16_t ntohs(uint16_t);
-#else
-extern in_addr_t htonl(in_addr_t);
-extern in_port_t htons(in_port_t);
-extern in_addr_t ntohl(in_addr_t);
-extern in_port_t ntohs(in_port_t);
-#endif /* !_XPG4_2 || __EXTENSIONS__ || _XPG5 */
-
-#if defined(_LP64) || defined(_LONGLONG_TYPE)
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-extern uint64_t htonll(uint64_t);
-extern uint64_t ntohll(uint64_t);
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-#endif /* _LP64 || _LONGLONG_TYPE */
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-
-/*
- * Macros to reverse byte order
- */
-#define BSWAP_8(x) ((x) & 0xff)
-#if !defined(__i386) && !defined(__amd64)
-#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
-#define BSWAP_32(x) (((uint32_t)(x) << 24) | \
- (((uint32_t)(x) << 8) & 0xff0000) | \
- (((uint32_t)(x) >> 8) & 0xff00) | \
- ((uint32_t)(x) >> 24))
-#else /* x86 */
-#define BSWAP_16(x) htons(x)
-#define BSWAP_32(x) htonl(x)
-#endif /* !__i386 && !__amd64 */
-
-#if defined(_LP64) || defined(_LONGLONG_TYPE)
-#if (!defined(__i386) && !defined(__amd64))
-#define BSWAP_64(x) (((uint64_t)(x) << 56) | \
- (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
- (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
- (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
- (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
- (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
- (((uint64_t)(x) >> 40) & 0xff00ULL) | \
- ((uint64_t)(x) >> 56))
-#else /* x86 */
-#define BSWAP_64(x) htonll(x)
-#endif /* !__i386 && !__amd64 */
-#else /* no uint64_t */
-#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
-#endif /* _LP64 || _LONGLONG_TYPE */
-
-#define BMASK_8(x) ((x) & 0xff)
-#define BMASK_16(x) ((x) & 0xffff)
-#define BMASK_32(x) ((x) & 0xffffffff)
-#define BMASK_64(x) (x)
-
-/*
- * Macros to convert from a specific byte order to/from native byte order
- */
-#if BYTE_ORDER == _BIG_ENDIAN
-#define BE_8(x) BMASK_8(x)
-#define BE_16(x) BMASK_16(x)
-#define BE_32(x) BMASK_32(x)
-#define BE_64(x) BMASK_64(x)
-#define LE_8(x) BSWAP_8(x)
-#define LE_16(x) BSWAP_16(x)
-#define LE_32(x) BSWAP_32(x)
-#define LE_64(x) BSWAP_64(x)
-#else
-#define LE_8(x) BMASK_8(x)
-#define LE_16(x) BMASK_16(x)
-#define LE_32(x) BMASK_32(x)
-#define LE_64(x) BMASK_64(x)
-#define BE_8(x) BSWAP_8(x)
-#define BE_16(x) BSWAP_16(x)
-#define BE_32(x) BSWAP_32(x)
-#define BE_64(x) BSWAP_64(x)
-#endif
-
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BYTEORDER_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
index 2b9ac3f98388..43f14ebb369d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_CALLB_H
#define _SYS_CALLB_H
-#pragma ident "@(#)callb.h 1.29 05/06/23 SMI"
-
#include <sys/kcondvar.h>
#ifdef __cplusplus
@@ -68,7 +65,8 @@ extern "C" {
#define CB_CL_MDBOOT CB_CL_UADMIN
#define CB_CL_ENTER_DEBUGGER 14
#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
+#define CB_CL_CPU_DEEP_IDLE 16
+#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */
/*
* CB_CL_CPR_DAEMON class specific definitions are given below:
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
index b9e0da4e1993..b44dda5e8418 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_CPUPART_H
#define _SYS_CPUPART_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/processor.h>
#include <sys/cpuvar.h>
@@ -58,16 +55,6 @@ typedef int cpupartid_t;
#define CP_ALL 0 /* return all cpu partitions */
#define CP_NONEMPTY 1 /* return only non-empty ones */
-#if defined(_MACHDEP)
-struct mach_cpupart {
- cpuset_t mc_haltset;
-};
-
-extern struct mach_cpupart cp_default_mach;
-#else
-struct mach_cpupart;
-#endif
-
typedef struct cpupart {
disp_t cp_kp_queue; /* partition-wide kpreempt queue */
cpupartid_t cp_id; /* partition ID */
@@ -103,8 +90,7 @@ typedef struct cpupart {
lgrp_gen_t cp_gen; /* generation number */
lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */
bitset_t cp_cmt_pgs; /* CMT PGs represented */
-
- struct mach_cpupart *cp_mach; /* mach-specific */
+ bitset_t cp_haltset; /* halted CPUs */
} cpupart_t;
typedef struct cpupart_kstat {
@@ -138,6 +124,15 @@ extern cpupart_t *cp_list_head;
extern uint_t cp_numparts;
extern uint_t cp_numparts_nonempty;
+/*
+ * Each partition contains a bitset that indicates which CPUs are halted and
+ * which ones are running. Given the growing number of CPUs in current and
+ * future platforms, it's important to fanout each CPU within its partition's
+ * haltset to prevent contention due to false sharing. The fanout factor
+ * is platform specific, and declared accordingly.
+ */
+extern uint_t cp_haltset_fanout;
+
extern void cpupart_initialize_default();
extern cpupart_t *cpupart_find(psetid_t);
extern int cpupart_create(psetid_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
index 0a038e00d0e4..d4075d580be7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_CPUVAR_H
@@ -168,7 +167,7 @@ typedef struct cpu {
ftrace_data_t cpu_ftrace; /* per cpu ftrace data */
- clock_t cpu_deadman_lbolt; /* used by deadman() */
+ clock_t cpu_deadman_counter; /* used by deadman() */
uint_t cpu_deadman_countdown; /* used by deadman() */
kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */
@@ -211,12 +210,27 @@ typedef struct cpu {
uint64_t cpu_curr_clock; /* current clock freq in Hz */
char *cpu_supp_freqs; /* supported freqs in Hz */
+ uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */
+ uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */
+
/*
* Interrupt load factor used by dispatcher & softcall
*/
hrtime_t cpu_intrlast; /* total interrupt time (nsec) */
int cpu_intrload; /* interrupt load factor (0-99%) */
+ uint_t cpu_rotor; /* for cheap pseudo-random numbers */
+
+ struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */
+
+ /*
+ * cpu_generation is updated whenever CPU goes on-line or off-line.
+ * Updates to cpu_generation are protected by cpu_lock.
+ *
+ * See CPU_NEW_GENERATION() macro below.
+ */
+ volatile uint_t cpu_generation; /* tracking on/off-line */
+
/*
* New members must be added /before/ this member, as the CTF tools
* rely on this being the last field before cpu_m, so they can
@@ -238,12 +252,13 @@ typedef struct cpu {
* is up to the platform to assure that this is performed properly. Note that
* the structure is sized to avoid false sharing.
*/
-#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uintptr_t) + \
- sizeof (kmutex_t))
+#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \
+ sizeof (uintptr_t) + sizeof (kmutex_t))
#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE
typedef struct cpu_core {
uint16_t cpuc_dtrace_flags; /* DTrace flags */
+ uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */
uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */
uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */
kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */
@@ -261,6 +276,28 @@ extern cpu_core_t cpu_core[];
*/
#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1))
+/*
+ * Check to see if an interrupt thread might be active at a given ipl.
+ * If so return true.
+ * We must be conservative--it is ok to give a false yes, but a false no
+ * will cause disaster. (But if the situation changes after we check it is
+ * ok--the caller is trying to ensure that an interrupt routine has been
+ * exited).
+ * This is used when trying to remove an interrupt handler from an autovector
+ * list in avintr.c.
+ */
+#define INTR_ACTIVE(cpup, level) \
+ ((level) <= LOCK_LEVEL ? \
+ ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup)))
+
+/*
+ * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one
+ * looks at it. It's meant as a cheap mechanism to be incorporated in routines
+ * wanting to avoid biasing, but where true randomness isn't needed (just
+ * something that changes).
+ */
+#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++)
+
#if defined(_KERNEL) || defined(_KMEMUSER)
#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE)
@@ -352,7 +389,6 @@ extern cpu_core_t cpu_core[];
#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
-
#endif /* _KERNEL || _KMEMUSER */
#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
@@ -516,6 +552,7 @@ extern cpuset_t cpu_seqid_inuse;
#if defined(_KERNEL) || defined(_KMEMUSER)
extern struct cpu *cpu[]; /* indexed by CPU number */
+extern struct cpu **cpu_seq; /* indexed by sequential CPU id */
extern cpu_t *cpu_list; /* list of CPUs */
extern cpu_t *cpu_active; /* list of active CPUs */
extern int ncpus; /* number of CPUs present */
@@ -526,6 +563,7 @@ extern int boot_ncpus; /* # cpus present @ boot */
extern processorid_t max_cpuid; /* maximum CPU number */
extern struct cpu *cpu_inmotion; /* offline or partition move target */
extern cpu_t *clock_cpu_list;
+extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */
#if defined(__i386) || defined(__amd64)
extern struct cpu *curcpup(void);
@@ -569,6 +607,13 @@ extern struct cpu *curcpup(void);
#define CPU_STATS(cp, stat) \
((cp)->cpu_stats.stat)
+/*
+ * Increment CPU generation value.
+ * This macro should be called whenever CPU goes on-line or off-line.
+ * Updates to cpu_generation should be protected by cpu_lock.
+ */
+#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++)
+
#endif /* _KERNEL || _KMEMUSER */
/*
@@ -658,6 +703,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */
const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
+void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
/* frequencies */
@@ -697,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */
*/
extern kmutex_t cpu_lock; /* lock protecting CPU data */
+/*
+ * CPU state change events
+ *
+ * Various subsystems need to know when CPUs change their state. They get this
+ * information by registering CPU state change callbacks using
+ * register_cpu_setup_func(). Whenever any CPU changes its state, the callback
+ * function is called. The callback function is passed three arguments:
+ *
+ * Event, described by cpu_setup_t
+ * CPU ID
+ * Transparent pointer passed when registering the callback
+ *
+ * The callback function is called with cpu_lock held. The return value from the
+ * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG
+ * events. For these two events, a non-zero return value indicates a failure and
+ * prevents successful completion of the operation.
+ *
+ * New events may be added in the future. Callback functions should ignore any
+ * events that they do not understand.
+ *
+ * The following events provide notification callbacks:
+ *
+ * CPU_INIT A new CPU is started and added to the list of active CPUs
+ * This event is only used during boot
+ *
+ * CPU_CONFIG A newly inserted CPU is prepared to start running code
+ * This event is called by DR code
+ *
+ * CPU_UNCONFIG CPU has been powered off and needs cleanup
+ * This event is called by DR code
+ *
+ * CPU_ON CPU is enabled but does not run anything yet
+ *
+ * CPU_INTR_ON CPU is enabled and has interrupts enabled
+ *
+ * CPU_OFF CPU is going offline but can still run threads
+ *
+ * CPU_CPUPART_OUT CPU is going to move out of its partition
+ *
+ * CPU_CPUPART_IN CPU is going to move to a new partition
+ *
+ * CPU_SETUP CPU is set up during boot and can run threads
+ */
typedef enum {
CPU_INIT,
CPU_CONFIG,
@@ -704,7 +793,9 @@ typedef enum {
CPU_ON,
CPU_OFF,
CPU_CPUPART_IN,
- CPU_CPUPART_OUT
+ CPU_CPUPART_OUT,
+ CPU_SETUP,
+ CPU_INTR_ON
} cpu_setup_t;
typedef int cpu_setup_func_t(cpu_setup_t, int, void *);
@@ -718,6 +809,13 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *);
extern void cpu_state_change_notify(int, cpu_setup_t);
/*
+ * Call specified function on the given CPU
+ */
+typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t);
+extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t);
+
+
+/*
* Create various strings that describe the given CPU for the
* processor_info system call and configuration-related kstats.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
index e84f1e04305d..5056f9a51105 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,8 +34,6 @@
#ifndef _SYS_CRED_H
#define _SYS_CRED_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#ifdef __cplusplus
@@ -58,6 +56,7 @@ struct prcred;
struct ksid;
struct ksidlist;
struct credklpd;
+struct credgrp;
struct auditinfo_addr; /* cred.h is included in audit.h */
@@ -79,6 +78,7 @@ extern cred_t *crdup(cred_t *);
extern void crdup_to(cred_t *, cred_t *);
extern cred_t *crgetcred(void);
extern void crset(struct proc *, cred_t *);
+extern void crset_zone_privall(cred_t *);
extern int groupmember(gid_t, const cred_t *);
extern int supgroupmember(gid_t, const cred_t *);
extern int hasprocperm(const cred_t *, const cred_t *);
@@ -104,6 +104,7 @@ extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *);
extern uint_t crgetref(const cred_t *);
extern const gid_t *crgetgroups(const cred_t *);
+extern const gid_t *crgetggroups(const struct credgrp *);
extern int crgetngroups(const cred_t *);
@@ -120,7 +121,13 @@ extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t);
*/
extern int crsetugid(cred_t *, uid_t, gid_t);
+/*
+ * Functions to handle the supplemental group list.
+ */
extern int crsetgroups(cred_t *, int, gid_t *);
+extern struct credgrp *crgrpcopyin(int, gid_t *);
+extern void crgrprele(struct credgrp *);
+extern void crsetcredgrp(cred_t *, struct credgrp *);
/*
* Private interface for setting zone association of credential.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
index 5fabb14a290e..6467781ce806 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
@@ -19,18 +19,18 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
-
#ifndef _SYS_DEBUG_H
#define _SYS_DEBUG_H
#include <sys/types.h>
+#include <sys/note.h>
#ifdef __cplusplus
extern "C" {
@@ -73,6 +73,25 @@ extern int assfail();
#endif
/*
+ * IMPLY and EQUIV are assertions of the form:
+ *
+ * if (a) then (b)
+ * and
+ * if (a) then (b) *AND* if (b) then (a)
+ */
+#ifdef DEBUG
+#define IMPLY(A, B) \
+ ((void)(((!(A)) || (B)) || \
+ assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__)))
+#define EQUIV(A, B) \
+ ((void)((!!(A) == !!(B)) || \
+ assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__)))
+#else
+#define IMPLY(A, B) ((void)0)
+#define EQUIV(A, B) ((void)0)
+#endif
+
+/*
* ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
* and prints out the values of the left and right hand expressions as part of
* the panic message to ease debugging. The three variants imply the type
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
index 21b7dbe52c11..c752edc99bbd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -68,6 +68,18 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
#define FM_EREPORT_FAILMODE_WAIT "wait"
#define FM_EREPORT_FAILMODE_CONTINUE "continue"
@@ -75,6 +87,7 @@ extern "C" {
#define FM_RESOURCE_REMOVED "removed"
#define FM_RESOURCE_AUTOREPLACE "autoreplace"
+#define FM_RESOURCE_STATECHANGE "statechange"
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
index 20c07890fad4..f5f93421bd74 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FM_PROTOCOL_H
@@ -43,11 +42,13 @@ extern "C" {
#define FM_CLASS "class"
#define FM_VERSION "version"
-/* FM event class values */
+/* FM protocol category 1 class names */
#define FM_EREPORT_CLASS "ereport"
#define FM_FAULT_CLASS "fault"
+#define FM_DEFECT_CLASS "defect"
#define FM_RSRC_CLASS "resource"
#define FM_LIST_EVENT "list"
+#define FM_IREPORT_CLASS "ireport"
/* FM list.* event class values */
#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
@@ -71,6 +72,12 @@ extern "C" {
/* list.* event payload member names */
#define FM_LIST_EVENT_SIZE "list-sz"
+/* ireport.* event payload member names */
+#define FM_IREPORT_DETECTOR "detector"
+#define FM_IREPORT_UUID "uuid"
+#define FM_IREPORT_PRIORITY "pri"
+#define FM_IREPORT_ATTRIBUTES "attr"
+
/*
* list.suspect, isolated, updated, repaired and resolved
* versions/payload member names.
@@ -82,9 +89,11 @@ extern "C" {
#define FM_SUSPECT_FAULT_LIST "fault-list"
#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
#define FM_SUSPECT_FAULT_STATUS "fault-status"
+#define FM_SUSPECT_INJECTED "__injected"
#define FM_SUSPECT_MESSAGE "message"
#define FM_SUSPECT_RETIRE "retire"
#define FM_SUSPECT_RESPONSE "response"
+#define FM_SUSPECT_SEVERITY "severity"
#define FM_SUSPECT_VERS0 0
#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
@@ -120,6 +129,7 @@ extern "C" {
#define FM_RSRC_ASRU_REPAIRED "repaired"
#define FM_RSRC_ASRU_REPLACED "replaced"
#define FM_RSRC_ASRU_ACQUITTED "acquitted"
+#define FM_RSRC_ASRU_RESOLVED "resolved"
#define FM_RSRC_ASRU_UNUSABLE "unusable"
#define FM_RSRC_ASRU_EVENT "event"
@@ -128,6 +138,8 @@ extern "C" {
#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
#define FM_RSRC_XPRT_UUID "uuid"
#define FM_RSRC_XPRT_SUBCLASS "subclass"
+#define FM_RSRC_XPRT_FAULT_STATUS "fault-status"
+#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru"
/*
* FM ENA Format Macros
@@ -166,6 +178,7 @@ extern "C" {
/* FMRI authority-type member names */
#define FM_FMRI_AUTH_CHASSIS "chassis-id"
+#define FM_FMRI_AUTH_PRODUCT_SN "product-sn"
#define FM_FMRI_AUTH_PRODUCT "product-id"
#define FM_FMRI_AUTH_DOMAIN "domain-id"
#define FM_FMRI_AUTH_SERVER "server-id"
@@ -185,6 +198,7 @@ extern "C" {
#define FM_FMRI_SCHEME_PKG "pkg"
#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
#define FM_FMRI_SCHEME_ZFS "zfs"
+#define FM_FMRI_SCHEME_SW "sw"
/* Scheme versions */
#define FMD_SCHEME_VERSION0 0
@@ -204,8 +218,12 @@ extern "C" {
#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
#define LEGACY_SCHEME_VERSION0 0
#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define SVC_SCHEME_VERSION0 0
+#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0
#define ZFS_SCHEME_VERSION0 0
#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
+#define SW_SCHEME_VERSION0 0
+#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0
/* hc scheme member names */
#define FM_FMRI_HC_SERIAL_ID "serial"
@@ -237,6 +255,7 @@ extern "C" {
/* dev scheme member names */
#define FM_FMRI_DEV_ID "devid"
+#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id"
#define FM_FMRI_DEV_PATH "device-path"
/* pkg scheme member names */
@@ -245,14 +264,13 @@ extern "C" {
#define FM_FMRI_PKG_VERSION "pkg-version"
/* svc scheme member names */
-#define FM_FMRI_SVC_NAME "service-name"
-#define FM_FMRI_SVC_VERSION "service-version"
-#define FM_FMRI_SVC_INSTANCE "instance"
-#define FM_FMRI_SVC_CONTRACT_ID "contract-id"
+#define FM_FMRI_SVC_NAME "svc-name"
+#define FM_FMRI_SVC_INSTANCE "svc-instance"
+#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id"
/* svc-authority member names */
#define FM_FMRI_SVC_AUTH_SCOPE "scope"
-#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN"
+#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn"
/* cpu scheme member names */
#define FM_FMRI_CPU_ID "cpuid"
@@ -290,6 +308,25 @@ extern "C" {
#define FM_FMRI_ZFS_POOL "pool"
#define FM_FMRI_ZFS_VDEV "vdev"
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define FM_FMRI_SW_OBJ "object"
+#define FM_FMRI_SW_OBJ_PATH "path"
+#define FM_FMRI_SW_OBJ_ROOT "root"
+#define FM_FMRI_SW_OBJ_PKG "pkg"
+#define FM_FMRI_SW_SITE "site"
+#define FM_FMRI_SW_SITE_TOKEN "token"
+#define FM_FMRI_SW_SITE_MODULE "module"
+#define FM_FMRI_SW_SITE_FILE "file"
+#define FM_FMRI_SW_SITE_LINE "line"
+#define FM_FMRI_SW_SITE_FUNC "func"
+#define FM_FMRI_SW_CTXT "context"
+#define FM_FMRI_SW_CTXT_ORIGIN "origin"
+#define FM_FMRI_SW_CTXT_EXECNAME "execname"
+#define FM_FMRI_SW_CTXT_PID "pid"
+#define FM_FMRI_SW_CTXT_ZONE "zone"
+#define FM_FMRI_SW_CTXT_CTID "ctid"
+#define FM_FMRI_SW_CTXT_STACK "stack"
+
extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
extern void fm_nva_xdestroy(nv_alloc_t *);
@@ -306,7 +343,7 @@ extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
int, ...);
extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *);
+ const char *, const char *);
extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
uint8_t *, const char *);
@@ -315,6 +352,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
const char *, const char *);
extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+ nvlist_t *, int, ...);
extern uint64_t fm_ena_increment(uint64_t);
extern uint64_t fm_ena_generate(uint64_t, uchar_t);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
index cd176f008d3f..7d7b94977afa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FM_UTIL_H
#define _SYS_FM_UTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -94,6 +91,7 @@ extern void fm_banner(void);
extern void fm_ereport_dump(void);
extern void fm_ereport_post(nvlist_t *, int);
+extern int is_fm_panic();
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index 8400dc1e93c4..edc26cde394d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -18,11 +18,13 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_FS_ZFS_H
#define _SYS_FS_ZFS_H
@@ -52,6 +54,10 @@ typedef enum {
#define ZFS_TYPE_DATASET \
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN (1024 * 8)
+#define ZAP_OLDMAXVALUELEN 1024
+
/*
* Dataset properties are identified by these constants and must be added to
* the end of this list to ensure that external consumers are not affected
@@ -83,12 +89,11 @@ typedef enum {
ZFS_PROP_READONLY,
ZFS_PROP_ZONED,
ZFS_PROP_SNAPDIR,
- ZFS_PROP_ACLMODE,
+ ZFS_PROP_PRIVATE, /* not exposed to user, temporary */
ZFS_PROP_ACLINHERIT,
ZFS_PROP_CREATETXG, /* not exposed to the user */
ZFS_PROP_NAME, /* not exposed to the user */
ZFS_PROP_CANMOUNT,
- ZFS_PROP_SHAREISCSI,
ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
ZFS_PROP_XATTR,
ZFS_PROP_NUMCLONES, /* not exposed to the user */
@@ -110,6 +115,15 @@ typedef enum {
ZFS_PROP_USEDCHILD,
ZFS_PROP_USEDREFRESERV,
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
+ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
+ ZFS_PROP_DEFER_DESTROY,
+ ZFS_PROP_USERREFS,
+ ZFS_PROP_LOGBIAS,
+ ZFS_PROP_UNIQUE, /* not exposed to the user */
+ ZFS_PROP_OBJSETID, /* not exposed to the user */
+ ZFS_PROP_DEDUP,
+ ZFS_PROP_MLSLABEL,
+ ZFS_PROP_SYNC,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -132,8 +146,6 @@ extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
typedef enum {
ZPOOL_PROP_NAME,
ZPOOL_PROP_SIZE,
- ZPOOL_PROP_USED,
- ZPOOL_PROP_AVAILABLE,
ZPOOL_PROP_CAPACITY,
ZPOOL_PROP_ALTROOT,
ZPOOL_PROP_HEALTH,
@@ -145,6 +157,12 @@ typedef enum {
ZPOOL_PROP_CACHEFILE,
ZPOOL_PROP_FAILUREMODE,
ZPOOL_PROP_LISTSNAPS,
+ ZPOOL_PROP_AUTOEXPAND,
+ ZPOOL_PROP_DEDUPDITTO,
+ ZPOOL_PROP_DEDUPRATIO,
+ ZPOOL_PROP_FREE,
+ ZPOOL_PROP_ALLOCATED,
+ ZPOOL_PROP_READONLY,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -159,10 +177,27 @@ typedef enum {
ZPROP_SRC_DEFAULT = 0x2,
ZPROP_SRC_TEMPORARY = 0x4,
ZPROP_SRC_LOCAL = 0x8,
- ZPROP_SRC_INHERITED = 0x10
+ ZPROP_SRC_INHERITED = 0x10,
+ ZPROP_SRC_RECEIVED = 0x20
} zprop_source_t;
-#define ZPROP_SRC_ALL 0x1f
+#define ZPROP_SRC_ALL 0x3f
+
+#define ZPROP_SOURCE_VAL_RECVD "$recvd"
+#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
+/*
+ * Dataset flag implemented as a special entry in the props zap object
+ * indicating that the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
+ * just as it did in earlier versions, and thereafter, local properties are
+ * preserved.
+ */
+#define ZPROP_HAS_RECVD "$hasrecvd"
+
+typedef enum {
+ ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
+ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
+} zprop_errflags_t;
typedef int (*zprop_func)(int, void *);
@@ -184,9 +219,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
const char *zfs_prop_to_name(zfs_prop_t);
zfs_prop_t zfs_name_to_prop(const char *);
boolean_t zfs_prop_user(const char *);
-boolean_t zfs_prop_userquota(const char *name);
+boolean_t zfs_prop_userquota(const char *);
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
/*
@@ -199,6 +235,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t);
boolean_t zpool_prop_readonly(zpool_prop_t);
int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
/*
* Definitions for the Delegation.
@@ -229,6 +266,8 @@ typedef enum {
#define ZFS_DELEG_PERM_GID "gid"
#define ZFS_DELEG_PERM_GROUPS "groups"
+#define ZFS_MLSLABEL_DEFAULT "none"
+
#define ZFS_SMB_ACL_SRC "src"
#define ZFS_SMB_ACL_TARGET "target"
@@ -238,6 +277,11 @@ typedef enum {
ZFS_CANMOUNT_NOAUTO = 2
} zfs_canmount_type_t;
+typedef enum {
+ ZFS_LOGBIAS_LATENCY = 0,
+ ZFS_LOGBIAS_THROUGHPUT = 1
+} zfs_logbias_op_t;
+
typedef enum zfs_share_op {
ZFS_SHARE_NFS = 0,
ZFS_UNSHARE_NFS = 1,
@@ -258,6 +302,12 @@ typedef enum zfs_cache_type {
ZFS_CACHE_ALL = 2
} zfs_cache_type_t;
+typedef enum {
+ ZFS_SYNC_STANDARD = 0,
+ ZFS_SYNC_ALWAYS = 1,
+ ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+
/*
* On-disk version number.
@@ -277,14 +327,28 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_13 13ULL
#define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
- * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_15
-#define SPA_VERSION_STRING "15"
+#define SPA_VERSION SPA_VERSION_28
+#define SPA_VERSION_STRING "28"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -300,8 +364,8 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_INITIAL SPA_VERSION_1
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
#define SPA_VERSION_SPARES SPA_VERSION_3
-#define SPA_VERSION_RAID6 SPA_VERSION_3
-#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
+#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
@@ -321,6 +385,20 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -328,14 +406,15 @@ typedef enum zfs_cache_type {
* also update the version_table[] and help message in zfs_prop.c.
*
* When changing, be sure to teach GRUB how to read the new format!
- * See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*}
+ * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
*/
#define ZPL_VERSION_1 1ULL
#define ZPL_VERSION_2 2ULL
#define ZPL_VERSION_3 3ULL
#define ZPL_VERSION_4 4ULL
-#define ZPL_VERSION ZPL_VERSION_4
-#define ZPL_VERSION_STRING "4"
+#define ZPL_VERSION_5 5ULL
+#define ZPL_VERSION ZPL_VERSION_5
+#define ZPL_VERSION_STRING "5"
#define ZPL_VERSION_INITIAL ZPL_VERSION_1
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
@@ -343,6 +422,23 @@ typedef enum zfs_cache_type {
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
+#define ZPL_VERSION_SA ZPL_VERSION_5
+
+/* Rewind request information */
+#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
+#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
+#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
+#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
+#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
+#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
+#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
+
+typedef struct zpool_rewind_policy {
+ uint32_t zrp_request; /* rewind behavior requested */
+ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */
+ uint64_t zrp_maxdata; /* max acceptable data errors */
+ uint64_t zrp_txg; /* specific txg to load */
+} zpool_rewind_policy_t;
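The rewind mask macros above are plain ORs of the member bits; a standalone sketch (values restated from the header so it compiles on its own) verifying the arithmetic:

#include <assert.h>

#define	ZPOOL_NO_REWIND		1
#define	ZPOOL_NEVER_REWIND	2
#define	ZPOOL_TRY_REWIND	4
#define	ZPOOL_DO_REWIND		8
#define	ZPOOL_EXTREME_REWIND	16
#define	ZPOOL_REWIND_MASK	28
#define	ZPOOL_REWIND_POLICIES	31

int
main(void)
{
	/* The "rewind bits" are TRY | DO | EXTREME. */
	assert((ZPOOL_TRY_REWIND | ZPOOL_DO_REWIND |
	    ZPOOL_EXTREME_REWIND) == ZPOOL_REWIND_MASK);
	/* All policy bits include NO and NEVER as well. */
	assert((ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND |
	    ZPOOL_REWIND_MASK) == ZPOOL_REWIND_POLICIES);
	return (0);
}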
/*
* The following are configuration names used in the nvlist describing a pool's
@@ -367,7 +463,8 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_ASHIFT "ashift"
#define ZPOOL_CONFIG_ASIZE "asize"
#define ZPOOL_CONFIG_DTL "DTL"
-#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
@@ -376,13 +473,28 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
#define ZPOOL_CONFIG_UNSPARE "unspare"
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
+#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
+#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
+#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
+#define ZPOOL_CONFIG_SPLIT "splitcfg"
+#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
+#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
+#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
+#define ZPOOL_CONFIG_REMOVING "removing"
+#define ZPOOL_CONFIG_RESILVERING "resilvering"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
+#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
+#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -393,6 +505,19 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_DEGRADED "degraded"
#define ZPOOL_CONFIG_REMOVED "removed"
#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
+
+/* Rewind policy parameters */
+#define ZPOOL_REWIND_POLICY "rewind-policy"
+#define ZPOOL_REWIND_REQUEST "rewind-request"
+#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg"
+#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh"
+#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh"
+
+/* Rewind data discovered */
+#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
+#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
+#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -401,6 +526,7 @@ typedef enum zfs_cache_type {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
@@ -450,7 +576,9 @@ typedef enum vdev_aux {
VDEV_AUX_SPARED, /* hot spare used in another pool */
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
- VDEV_AUX_BAD_LOG /* cannot read log chain(s) */
+ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
+ VDEV_AUX_EXTERNAL, /* external diagnosis */
+ VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */
} vdev_aux_t;
/*
@@ -471,14 +599,14 @@ typedef enum pool_state {
} pool_state_t;
/*
- * Scrub types.
+ * Scan Functions.
*/
-typedef enum pool_scrub_type {
- POOL_SCRUB_NONE,
- POOL_SCRUB_RESILVER,
- POOL_SCRUB_EVERYTHING,
- POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+ POOL_SCAN_NONE,
+ POOL_SCAN_SCRUB,
+ POOL_SCAN_RESILVER,
+ POOL_SCAN_FUNCS
+} pool_scan_func_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -494,6 +622,36 @@ typedef enum zio_type {
} zio_type_t;
/*
+ * Pool statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+ /* values stored on disk */
+ uint64_t pss_func; /* pool_scan_func_t */
+ uint64_t pss_state; /* dsl_scan_state_t */
+ uint64_t pss_start_time; /* scan start time */
+ uint64_t pss_end_time; /* scan end time */
+ uint64_t pss_to_examine; /* total bytes to scan */
+ uint64_t pss_examined; /* total examined bytes */
+ uint64_t pss_to_process; /* total bytes to process */
+ uint64_t pss_processed; /* total processed bytes */
+ uint64_t pss_errors; /* scan errors */
+
+ /* values not stored on disk */
+ uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_start; /* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+ DSS_NONE,
+ DSS_SCANNING,
+ DSS_FINISHED,
+ DSS_CANCELED,
+ DSS_NUM_STATES
+} dsl_scan_state_t;
+
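Userland consumers (zpool status, for instance) can derive scan progress from the pss_examined and pss_to_examine counters above; a small standalone sketch assuming only those two fields:

#include <stdint.h>
#include <stdio.h>

static double
scan_pct_done(uint64_t examined, uint64_t to_examine)
{
	if (to_examine == 0)
		return (100.0);
	return (100.0 * (double)examined / (double)to_examine);
}

int
main(void)
{
	/* 3 GiB of 10 GiB examined -> 30.00% */
	(void) printf("%.2f%%\n", scan_pct_done(3ULL << 30, 10ULL << 30));
	return (0);
}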
+
+/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -511,34 +669,50 @@ typedef struct vdev_stat {
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
uint64_t vs_self_healed; /* self-healed bytes */
- uint64_t vs_scrub_type; /* pool_scrub_type_t */
- uint64_t vs_scrub_complete; /* completed? */
- uint64_t vs_scrub_examined; /* bytes examined; top */
- uint64_t vs_scrub_repaired; /* bytes repaired; leaf */
- uint64_t vs_scrub_errors; /* errors during scrub */
- uint64_t vs_scrub_start; /* UTC scrub start time */
- uint64_t vs_scrub_end; /* UTC scrub end time */
+ uint64_t vs_scan_removing; /* removing? */
+ uint64_t vs_scan_processed; /* scan processed bytes */
} vdev_stat_t;
+/*
+ * DDT statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+ uint64_t ddo_count; /* number of elements in ddt */
+ uint64_t ddo_dspace; /* size of ddt on disk */
+ uint64_t ddo_mspace; /* size of ddt in-core */
+} ddt_object_t;
+
+typedef struct ddt_stat {
+ uint64_t dds_blocks; /* blocks */
+ uint64_t dds_lsize; /* logical size */
+ uint64_t dds_psize; /* physical size */
+ uint64_t dds_dsize; /* deflated allocated size */
+ uint64_t dds_ref_blocks; /* referenced blocks */
+ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
+ uint64_t dds_ref_psize; /* referenced psize * refcnt */
+ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+ ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
+} ddt_histogram_t;
+
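The DDT statistics make the pool dedup ratio computable as referenced size over allocated size; a standalone sketch restating only the two ddt_stat_t fields it uses:

#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint64_t dds_dsize;	/* deflated allocated size */
	uint64_t dds_ref_dsize;	/* referenced dsize * refcnt */
} dds_sketch_t;

static double
dedup_ratio(const dds_sketch_t *dds)
{
	if (dds->dds_dsize == 0)
		return (1.0);
	return ((double)dds->dds_ref_dsize / (double)dds->dds_dsize);
}

int
main(void)
{
	/* 1 GiB allocated backing 3 GiB referenced -> 3.00x */
	dds_sketch_t total = { 1ULL << 30, 3ULL << 30 };

	(void) printf("dedup = %.2fx\n", dedup_ratio(&total));
	return (0);
}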
#define ZVOL_DRIVER "zvol"
#define ZFS_DRIVER "zfs"
#define ZFS_DEV_NAME "zfs"
#define ZFS_DEV "/dev/" ZFS_DEV_NAME
-/*
- * zvol paths. Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix. Below are the names without /dev.
- */
-#define ZVOL_DEV_DIR "zvol"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
-#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:"
-#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/"
+/* general zvol path */
+#define ZVOL_DIR "/dev/zvol"
+/* expansion */
+#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
+/* for dump and swap */
+#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
+#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"
#define ZVOL_PROP_NAME "name"
+#define ZVOL_DEFAULT_BLOCKSIZE 8192
/*
* /dev/zfs ioctl numbers.
@@ -554,7 +728,7 @@ typedef unsigned long zfs_ioc_t;
#define ZFS_IOC_POOL_CONFIGS _IOWR('Z', 4, struct zfs_cmd)
#define ZFS_IOC_POOL_STATS _IOWR('Z', 5, struct zfs_cmd)
#define ZFS_IOC_POOL_TRYIMPORT _IOWR('Z', 6, struct zfs_cmd)
-#define ZFS_IOC_POOL_SCRUB _IOWR('Z', 7, struct zfs_cmd)
+#define ZFS_IOC_POOL_SCAN _IOWR('Z', 7, struct zfs_cmd)
#define ZFS_IOC_POOL_FREEZE _IOWR('Z', 8, struct zfs_cmd)
#define ZFS_IOC_POOL_UPGRADE _IOWR('Z', 9, struct zfs_cmd)
#define ZFS_IOC_POOL_GET_HISTORY _IOWR('Z', 10, struct zfs_cmd)
@@ -564,52 +738,60 @@ typedef unsigned long zfs_ioc_t;
#define ZFS_IOC_VDEV_ATTACH _IOWR('Z', 14, struct zfs_cmd)
#define ZFS_IOC_VDEV_DETACH _IOWR('Z', 15, struct zfs_cmd)
#define ZFS_IOC_VDEV_SETPATH _IOWR('Z', 16, struct zfs_cmd)
-#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 17, struct zfs_cmd)
-#define ZFS_IOC_OBJSET_ZPLPROPS _IOWR('Z', 18, struct zfs_cmd)
-#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 19, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
-#define ZFS_IOC_SET_PROP _IOWR('Z', 21, struct zfs_cmd)
-#define ZFS_IOC_CREATE_MINOR _IOWR('Z', 22, struct zfs_cmd)
-#define ZFS_IOC_REMOVE_MINOR _IOWR('Z', 23, struct zfs_cmd)
-#define ZFS_IOC_CREATE _IOWR('Z', 24, struct zfs_cmd)
-#define ZFS_IOC_DESTROY _IOWR('Z', 25, struct zfs_cmd)
-#define ZFS_IOC_ROLLBACK _IOWR('Z', 26, struct zfs_cmd)
-#define ZFS_IOC_RENAME _IOWR('Z', 27, struct zfs_cmd)
-#define ZFS_IOC_RECV _IOWR('Z', 28, struct zfs_cmd)
-#define ZFS_IOC_SEND _IOWR('Z', 29, struct zfs_cmd)
-#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 30, struct zfs_cmd)
-#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 31, struct zfs_cmd)
-#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 32, struct zfs_cmd)
-#define ZFS_IOC_ERROR_LOG _IOWR('Z', 33, struct zfs_cmd)
-#define ZFS_IOC_CLEAR _IOWR('Z', 34, struct zfs_cmd)
-#define ZFS_IOC_PROMOTE _IOWR('Z', 35, struct zfs_cmd)
-#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 36, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT _IOWR('Z', 37, struct zfs_cmd)
-#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 38, struct zfs_cmd)
-#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 39, struct zfs_cmd)
-#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 40, struct zfs_cmd)
-#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 41, struct zfs_cmd)
-#define ZFS_IOC_SET_FSACL _IOWR('Z', 42, struct zfs_cmd)
-#define ZFS_IOC_GET_FSACL _IOWR('Z', 43, struct zfs_cmd)
-#define ZFS_IOC_ISCSI_PERM_CHECK _IOWR('Z', 44, struct zfs_cmd)
-#define ZFS_IOC_SHARE _IOWR('Z', 45, struct zfs_cmd)
-#define ZFS_IOC_INHERIT_PROP _IOWR('Z', 46, struct zfs_cmd)
-#define ZFS_IOC_JAIL _IOWR('Z', 47, struct zfs_cmd)
-#define ZFS_IOC_UNJAIL _IOWR('Z', 48, struct zfs_cmd)
-#define ZFS_IOC_SMB_ACL _IOWR('Z', 49, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_ONE _IOWR('Z', 50, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_MANY _IOWR('Z', 51, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_UPGRADE _IOWR('Z', 52, struct zfs_cmd)
-#define ZFS_IOC_SETFRU _IOWR('Z', 53, struct zfs_cmd)
+#define ZFS_IOC_VDEV_SETFRU _IOWR('Z', 17, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 18, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_ZPLPROPS _IOWR('Z', 19, struct zfs_cmd)
+#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 21, struct zfs_cmd)
+#define ZFS_IOC_SET_PROP _IOWR('Z', 22, struct zfs_cmd)
+#define ZFS_IOC_CREATE _IOWR('Z', 23, struct zfs_cmd)
+#define ZFS_IOC_DESTROY _IOWR('Z', 24, struct zfs_cmd)
+#define ZFS_IOC_ROLLBACK _IOWR('Z', 25, struct zfs_cmd)
+#define ZFS_IOC_RENAME _IOWR('Z', 26, struct zfs_cmd)
+#define ZFS_IOC_RECV _IOWR('Z', 27, struct zfs_cmd)
+#define ZFS_IOC_SEND _IOWR('Z', 28, struct zfs_cmd)
+#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 29, struct zfs_cmd)
+#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 30, struct zfs_cmd)
+#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 31, struct zfs_cmd)
+#define ZFS_IOC_ERROR_LOG _IOWR('Z', 32, struct zfs_cmd)
+#define ZFS_IOC_CLEAR _IOWR('Z', 33, struct zfs_cmd)
+#define ZFS_IOC_PROMOTE _IOWR('Z', 34, struct zfs_cmd)
+#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 35, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT _IOWR('Z', 36, struct zfs_cmd)
+#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 37, struct zfs_cmd)
+#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 38, struct zfs_cmd)
+#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 39, struct zfs_cmd)
+#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 40, struct zfs_cmd)
+#define ZFS_IOC_SET_FSACL _IOWR('Z', 41, struct zfs_cmd)
+#define ZFS_IOC_GET_FSACL _IOWR('Z', 42, struct zfs_cmd)
+#define ZFS_IOC_SHARE _IOWR('Z', 43, struct zfs_cmd)
+#define ZFS_IOC_INHERIT_PROP _IOWR('Z', 44, struct zfs_cmd)
+#define ZFS_IOC_SMB_ACL _IOWR('Z', 45, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_ONE _IOWR('Z', 46, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_MANY _IOWR('Z', 47, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_UPGRADE _IOWR('Z', 48, struct zfs_cmd)
+#define ZFS_IOC_HOLD _IOWR('Z', 49, struct zfs_cmd)
+#define ZFS_IOC_RELEASE _IOWR('Z', 50, struct zfs_cmd)
+#define ZFS_IOC_GET_HOLDS _IOWR('Z', 51, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_RECVD_PROPS _IOWR('Z', 52, struct zfs_cmd)
+#define ZFS_IOC_VDEV_SPLIT _IOWR('Z', 53, struct zfs_cmd)
+#define ZFS_IOC_NEXT_OBJ _IOWR('Z', 54, struct zfs_cmd)
+#define ZFS_IOC_DIFF _IOWR('Z', 55, struct zfs_cmd)
+#define ZFS_IOC_TMP_SNAPSHOT _IOWR('Z', 56, struct zfs_cmd)
+#define ZFS_IOC_OBJ_TO_STATS _IOWR('Z', 57, struct zfs_cmd)
+#define ZFS_IOC_JAIL _IOWR('Z', 58, struct zfs_cmd)
+#define ZFS_IOC_UNJAIL _IOWR('Z', 59, struct zfs_cmd)
/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
typedef enum {
- SPA_LOAD_NONE, /* no load in progress */
- SPA_LOAD_OPEN, /* normal open */
- SPA_LOAD_IMPORT, /* import in progress */
- SPA_LOAD_TRYIMPORT /* tryimport in progress */
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT, /* tryimport in progress */
+ SPA_LOAD_RECOVER, /* recovery requested */
+ SPA_LOAD_ERROR /* load failed */
} spa_load_state_t;
/*
@@ -641,9 +823,19 @@ typedef enum {
#define ZFS_ONLINE_CHECKREMOVE 0x1
#define ZFS_ONLINE_UNSPARE 0x2
#define ZFS_ONLINE_FORCEFAULT 0x4
+#define ZFS_ONLINE_EXPAND 0x8
#define ZFS_OFFLINE_TEMPORARY 0x1
/*
+ * Flags for ZFS_IOC_POOL_IMPORT
+ */
+#define ZFS_IMPORT_NORMAL 0x0
+#define ZFS_IMPORT_VERBATIM 0x1
+#define ZFS_IMPORT_ANY_HOST 0x2
+#define ZFS_IMPORT_MISSING_LOG 0x4
+#define ZFS_IMPORT_ONLY 0x8
+
+/*
* Sysevent payload members. ZFS will generate the following sysevents with the
* given payloads:
*
@@ -671,7 +863,7 @@ typedef enum {
/*
* Note: This is encoded on-disk, so new events must be added to the
* end, and unused events can not be removed. Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
*/
typedef enum history_internal_events {
LOG_NO_EVENT = 0,
@@ -688,7 +880,7 @@ typedef enum history_internal_events {
LOG_POOL_VDEV_OFFLINE,
LOG_POOL_UPGRADE,
LOG_POOL_CLEAR,
- LOG_POOL_SCRUB,
+ LOG_POOL_SCAN,
LOG_POOL_PROPSET,
LOG_DS_CREATE,
LOG_DS_CLONE,
@@ -711,7 +903,10 @@ typedef enum history_internal_events {
LOG_DS_UPGRADE,
LOG_DS_REFQUOTA,
LOG_DS_REFRESERV,
- LOG_POOL_SCRUB_DONE,
+ LOG_POOL_SCAN_DONE,
+ LOG_DS_USER_HOLD,
+ LOG_DS_USER_RELEASE,
+ LOG_POOL_SPLIT,
LOG_END
} history_internal_events_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
new file mode 100644
index 000000000000..36c9eaa7f18e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZUT_H
+#define _ZUT_H
+
+/*
+ * IOCTLs for the zfs unit test driver
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define ZUT_DRIVER "zut"
+#define ZUT_DEV "/dev/zut"
+
+#define ZUT_VERSION_STRING "1"
+
+/*
+ * /dev/zut ioctl numbers.
+ */
+#define ZUT_IOC ('U' << 8)
+
+/* Request flags */
+#define ZUT_IGNORECASE 0x01
+#define ZUT_ACCFILTER 0x02
+#define ZUT_XATTR 0x04
+#define ZUT_EXTRDDIR 0x08
+#define ZUT_GETSTAT 0x10
+
+typedef struct zut_lookup {
+ int zl_reqflags;
+ int zl_deflags; /* output */
+ int zl_retcode; /* output */
+ char zl_dir[MAXPATHLEN];
+ char zl_file[MAXNAMELEN];
+ char zl_xfile[MAXNAMELEN];
+ char zl_real[MAXPATHLEN]; /* output */
+ uint64_t zl_xvattrs; /* output */
+ struct stat64 zl_statbuf; /* output */
+} zut_lookup_t;
+
+typedef struct zut_readdir {
+ uint64_t zr_buf; /* pointer to output buffer */
+ uint64_t zr_loffset; /* output */
+ char zr_dir[MAXPATHLEN];
+ char zr_file[MAXNAMELEN];
+ int zr_reqflags;
+ int zr_retcode; /* output */
+ int zr_eof; /* output */
+ uint_t zr_bytes; /* output */
+ uint_t zr_buflen;
+} zut_readdir_t;
+
+typedef enum zut_ioc {
+ ZUT_IOC_MIN_CMD = ZUT_IOC - 1,
+ ZUT_IOC_LOOKUP = ZUT_IOC,
+ ZUT_IOC_READDIR,
+ ZUT_IOC_MAX_CMD
+} zut_ioc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZUT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
index 97f7ed66ae09..f3fc634fde71 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
@@ -20,15 +20,16 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * These are Consolidation Private interfaces and are subject to change.
*/
#ifndef _SYS_GFS_H
#define _SYS_GFS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mutex.h>
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
index 3a405e409708..39eeb905c72b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
@@ -19,14 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_IDMAP_H
#define _SYS_IDMAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
/* Idmap status codes */
#define IDMAP_SUCCESS 0
@@ -64,12 +63,13 @@
#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970
#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969
#define IDMAP_ERR_BAD_UTF8 -9968
-#define IDMAP_ERR_NONEGENERATED -9967
+#define IDMAP_ERR_NONE_GENERATED -9967
#define IDMAP_ERR_PROP_UNKNOWN -9966
#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965
#define IDMAP_ERR_NS_LDAP_PARTIAL -9964
#define IDMAP_ERR_NS_LDAP_CFG -9963
#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962
+#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961
/* Reserved GIDs for some well-known SIDs */
#define IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */
@@ -90,4 +90,8 @@
*/
#define IDMAP_MAX_DOOR_RPC (256 * 1024)
+#define IDMAP_SENTINEL_PID UINT32_MAX
+#define IDMAP_ID_IS_EPHEMERAL(pid) \
+ (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID))
+
#endif /* _SYS_IDMAP_H */
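The new ephemeral-id test covers ids above INT32_MAX while excluding the sentinel; a quick standalone check with the macros restated from the header above:

#include <stdint.h>
#include <stdio.h>

#define	IDMAP_SENTINEL_PID	UINT32_MAX
#define	IDMAP_ID_IS_EPHEMERAL(pid) \
	(((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID))

int
main(void)
{
	(void) printf("%d\n", IDMAP_ID_IS_EPHEMERAL(1000U));		/* 0 */
	(void) printf("%d\n", IDMAP_ID_IS_EPHEMERAL(0x80000001U));	/* 1 */
	(void) printf("%d\n", IDMAP_ID_IS_EPHEMERAL(UINT32_MAX));	/* 0 */
	return (0);
}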
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
index 19faf42f7311..f5156148bcbd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
@@ -452,6 +452,12 @@ extern "C" {
#elif defined(__powerpc__)
+#if defined(__BIG_ENDIAN__)
+#define _BIT_FIELDS_HTOL
+#else
+#define _BIT_FIELDS_LTOH
+#endif
+
/*
* The following set of definitions characterize the Solaris on SPARC systems.
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
index cc578674318a..abf84cf593e9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_NVPAIR_H
#define _SYS_NVPAIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/time.h>
#include <sys/errno.h>
@@ -160,6 +157,8 @@ int nvlist_unpack(char *, size_t, nvlist_t **, int);
int nvlist_dup(nvlist_t *, nvlist_t **, int);
int nvlist_merge(nvlist_t *, nvlist_t *, int);
+uint_t nvlist_nvflag(nvlist_t *);
+
int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
@@ -199,6 +198,7 @@ int nvlist_add_double(nvlist_t *, const char *, double);
int nvlist_remove(nvlist_t *, const char *, data_type_t);
int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
int nvlist_lookup_boolean(nvlist_t *, const char *);
int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
@@ -237,9 +237,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
int *, char **);
boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);
/* processing nvpair */
nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
+nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
char *nvpair_name(nvpair_t *);
data_type_t nvpair_type(nvpair_t *);
int nvpair_type_is_array(nvpair_t *);
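nvlist_prev_nvpair(), added above, mirrors nvlist_next_nvpair() but walks from the tail; a userland sketch (built against libnvpair) of reverse iteration:

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl;
	nvpair_t *nvp = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	(void) nvlist_add_uint64(nvl, "first", 1);
	(void) nvlist_add_uint64(nvl, "second", 2);

	/* Prints "second" then "first". */
	while ((nvp = nvlist_prev_nvpair(nvl, nvp)) != NULL)
		(void) printf("%s\n", nvpair_name(nvp));

	nvlist_free(nvl);
	return (0);
}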
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
index 3a76c8c9b420..c0fe6e21b85f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
@@ -32,8 +32,6 @@
#ifndef _SYS_PROCESSOR_H
#define _SYS_PROCESSOR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/procset.h>
@@ -140,6 +138,7 @@ extern lgrpid_t gethomelgroup();
* Internal interface prototypes
*/
extern int p_online_internal(processorid_t, int, int *);
+extern int p_online_internal_locked(processorid_t, int, int *);
#endif /* !_KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
index 0a61e41de849..3558c2a7f03d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,14 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SYSEVENT_H
#define _SYS_SYSEVENT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#ifdef __cplusplus
@@ -164,18 +161,50 @@ typedef struct sysevent_value {
#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */
/*
- * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to
- * the consolidation private interface.
+ * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to
+ * the consolidation private interface, so flags defined here are restricted
+ * to the LSB.
+ *
+ * EVCH_SUB_KEEP indicates that this subscription should persist even if
+ * this subscriber id dies unexpectedly; matching events will be
+ * queued (up to a limit) and will be delivered if/when we restart again
+ * with the same subscriber id.
+ */
+#define EVCH_SUB_KEEP 0x01
+
+/*
+ * Subscriptions may be wildcarded, but we limit the number of
+ * wildcards permitted.
+ */
+#define EVCH_WILDCARD_MAX 10
+
+/*
+ * Used in unsubscribe to indicate all subscriber ids for a channel.
*/
-#define EVCH_SUB_KEEP 0x0001
#define EVCH_ALLSUB "all_subs"
/*
* Meaning of flags parameter of channel bind function
+ *
+ * EVCH_CREAT indicates to create a channel if not already present.
+ *
+ * EVCH_HOLD_PEND indicates that events should be published to this
+ * channel even if there are no matching subscribers present; when
+ * a subscriber belatedly binds to the channel and registers their
+ * subscriptions they will receive events that predate their bind.
+ * If the channel is closed, however, with no remaining bindings then
+ * the channel is destroyed.
+ *
+ * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND -
+ * even if the channel has no remaining bindings it will not be
+ * destroyed so long as events remain unconsumed. This is suitable for
+ * use with short-lived event producers that may bind to (create) the
+ * channel and exit before the intended consumer has started.
*/
-#define EVCH_CREAT 0x0001 /* Create a channel if not present */
+#define EVCH_CREAT 0x0001
#define EVCH_HOLD_PEND 0x0002
-#define EVCH_B_FLAGS 0x0003 /* All valid bits */
+#define EVCH_HOLD_PEND_INDEF 0x0004
+#define EVCH_B_FLAGS 0x0007 /* All valid bits */
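A userland sketch (Solaris/illumos, linked with -lsysevent; the channel name is invented for illustration) of binding with the EVCH_HOLD_PEND semantics described above:

#include <libsysevent.h>

/*
 * Create the channel if absent and hold published events for
 * subscribers that bind later.
 */
static int
bind_held_channel(evchan_t **chp)
{
	return (sysevent_evc_bind("com.example.demo_chan", chp,
	    EVCH_CREAT | EVCH_HOLD_PEND));
}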
/*
* Meaning of commands of evc_control function
@@ -185,38 +214,71 @@ typedef struct sysevent_value {
#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
+#ifdef sun
/*
- * Event channel interface definitions
+ * Shared user/kernel event channel interface definitions
*/
-int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
-void sysevent_evc_unbind(evchan_t *);
-int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
+extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
+extern int sysevent_evc_unbind(evchan_t *);
+extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
int (*)(sysevent_t *, void *), void *, uint32_t);
-void sysevent_evc_unsubscribe(evchan_t *, const char *);
-int sysevent_evc_publish(evchan_t *, const char *, const char *,
+extern int sysevent_evc_unsubscribe(evchan_t *, const char *);
+extern int sysevent_evc_publish(evchan_t *, const char *, const char *,
const char *, const char *, nvlist_t *, uint32_t);
-int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
+extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
+#endif /* sun */
-#ifdef _KERNEL
+#ifndef _KERNEL
+
+#ifdef sun
+/*
+ * Userland-only event channel interfaces
+ */
+
+#include <door.h>
+
+typedef struct sysevent_subattr sysevent_subattr_t;
+
+extern sysevent_subattr_t *sysevent_subattr_alloc(void);
+extern void sysevent_subattr_free(sysevent_subattr_t *);
+
+extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *);
+extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *);
+
+extern void sysevent_subattr_thrcreate(sysevent_subattr_t *,
+ door_xcreate_server_func_t *, void *);
+extern void sysevent_subattr_thrsetup(sysevent_subattr_t *,
+ door_xcreate_thrsetup_func_t *, void *);
+
+extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
+ int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
+#endif /* sun */
+
+#else
/*
* Kernel log_event interfaces.
*/
-int log_sysevent(sysevent_t *, int, sysevent_id_t *);
-
-sysevent_t *sysevent_alloc(char *, char *, char *, int);
-void sysevent_free(sysevent_t *);
-int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int);
-void sysevent_free_attr(sysevent_attr_list_t *);
-int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
-void sysevent_detach_attributes(sysevent_t *);
-char *sysevent_get_class_name(sysevent_t *);
-char *sysevent_get_subclass_name(sysevent_t *);
-uint64_t sysevent_get_seq(sysevent_t *);
-void sysevent_get_time(sysevent_t *, hrtime_t *);
-size_t sysevent_get_size(sysevent_t *);
-char *sysevent_get_pub(sysevent_t *);
-int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+extern int log_sysevent(sysevent_t *, int, sysevent_id_t *);
+
+extern sysevent_t *sysevent_alloc(char *, char *, char *, int);
+extern void sysevent_free(sysevent_t *);
+extern int sysevent_add_attr(sysevent_attr_list_t **, char *,
+ sysevent_value_t *, int);
+extern void sysevent_free_attr(sysevent_attr_list_t *);
+extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
+extern void sysevent_detach_attributes(sysevent_t *);
+#ifdef sun
+extern char *sysevent_get_class_name(sysevent_t *);
+extern char *sysevent_get_subclass_name(sysevent_t *);
+extern uint64_t sysevent_get_seq(sysevent_t *);
+extern void sysevent_get_time(sysevent_t *, hrtime_t *);
+extern size_t sysevent_get_size(sysevent_t *);
+extern char *sysevent_get_pub(sysevent_t *);
+extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+#endif /* sun */
#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
new file mode 100644
index 000000000000..9d3107d09011
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSEVENT_DEV_H
+#define _SYS_SYSEVENT_DEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/sysevent/eventdefs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Event schema for EC_DEV_ADD/ESC_DISK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ * if it exists.
+ * /dev name associated with the device if it exists.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_DISK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ * if it exists.
+ * /dev name associated with the device if it exists.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Event Class - EC_DEV_BRANCH
+ * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path to the root node of the device subtree
+ * without the "/devices" prefix.
+ */
+
+#define EV_VERSION "version"
+#define DEV_PHYS_PATH "phys_path"
+#define DEV_NAME "dev_name"
+#define DEV_DRIVER_NAME "driver_name"
+#define DEV_INSTANCE "instance"
+#define DEV_PROP_PREFIX "prop-"
+
+#define EV_V1 1
+
+/* maximum number of devinfo node properties added to the event */
+#define MAX_PROP_COUNT 100
+
+/* only properties with size less than PROP_LEN_LIMIT are added to the event */
+#define PROP_LEN_LIMIT 1024
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_DEV_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
index c46223f76b18..dfa78179bb5f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SYSEVENT_EVENTDEFS_H
@@ -52,6 +51,7 @@ extern "C" {
#define EC_FM "EC_fm" /* FMA error report event */
#define EC_ZFS "EC_zfs" /* ZFS event */
#define EC_DATALINK "EC_datalink" /* datalink event */
+#define EC_VRRP "EC_vrrp" /* VRRP event */
/*
* The following event class is reserved for exclusive use
@@ -179,6 +179,8 @@ extern "C" {
/* Interface within an IPMP group has changed state or type */
#define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change"
+/* IPMP probe has changed state */
+#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state"
/*
* EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes
@@ -200,9 +202,16 @@ extern "C" {
/* device tree branch removed */
#define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove"
-/* device capacity dynamically changed */
+/*
+ * EC_DEV_STATUS subclass definitions
+ *
+ * device capacity dynamically changed
+ */
#define ESC_DEV_DLE "ESC_dev_dle"
+/* LUN has received an eject request from the user */
+#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request"
+
/* FMA Fault and Error event protocol subclass */
#define ESC_FM_ERROR "ESC_FM_error"
#define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay"
@@ -223,26 +232,43 @@ extern "C" {
#define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up"
#define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down"
+/* EC_ACPIEV subclass definitions */
+#define EC_ACPIEV "EC_acpiev"
+#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch"
+#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock"
+#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep"
+#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute"
+#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi"
+#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad"
+
/*
* ZFS subclass definitions. Supporting attributes (name/value pairs) are found
* in sys/fs/zfs.h
*/
-#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
-#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
-#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
-#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
-#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
-#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
-#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
-#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
-#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
-#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare"
+#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
+#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
+#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
+#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
+#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
+#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
+#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
+#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
+#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
+#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare"
+#define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach"
+#define ESC_ZFS_VDEV_AUTOEXPAND "ESC_ZFS_vdev_autoexpand"
/*
* datalink subclass definitions.
*/
#define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */
+/*
+ * VRRP subclass definitions. Supporting attributes (name/value pairs) are
+ * found in sys/sysevent/vrrp.h
+ */
+#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change"
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
index 22f9fe36601d..18f20908191f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
@@ -31,6 +31,7 @@
#define _SYS_SYSMACROS_H
#include <sys/param.h>
+#include <sys/isa_defs.h>
#ifdef __cplusplus
extern "C" {
@@ -57,6 +58,9 @@ extern "C" {
#ifndef ABS
#define ABS(a) ((a) < 0 ? -(a) : (a))
#endif
+#ifndef SIGNOF
+#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
+#endif
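A quick check of SIGNOF's three-way result (the trailing comparison contributes 0 or 1):

#include <assert.h>

#define	SIGNOF(a)	((a) < 0 ? -1 : (a) > 0)

int
main(void)
{
	assert(SIGNOF(-7) == -1);
	assert(SIGNOF(0) == 0);
	assert(SIGNOF(42) == 1);
	return (0);
}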
#ifdef _KERNEL
@@ -108,7 +112,7 @@ extern unsigned char bcd_to_byte[256];
#define L_MAXMIN L_MAXMIN32
#endif
-#if defined(sun)
+#ifdef sun
#ifdef _KERNEL
/* major part of a device internal to the kernel */
@@ -168,7 +172,6 @@ extern unsigned char bcd_to_byte[256];
#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
#define geteminor(x) (minor_t)((x) & L_MAXMIN)
-
#endif /* sun */
/*
@@ -371,6 +374,41 @@ extern unsigned char bcd_to_byte[256];
#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
#endif
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(ulong_t i)
+{
+ register int h = 1;
+
+ if (i == 0)
+ return (0);
+#ifdef _LP64
+ if (i & 0xffffffff00000000ul) {
+ h += 32; i >>= 32;
+ }
+#endif
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+}
+
#ifdef __cplusplus
}
#endif
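
A quick userland check of the two additions above. This is a standalone restatement of their contract (highbit() reports the 1-based index of the highest set bit, SIGNOF() collapses to -1/0/1), not the kernel header itself, so the loop is a simplified but equivalent reimplementation of the shift cascade:

    /* Illustrative only: verify the documented highbit()/SIGNOF() contract. */
    #include <assert.h>

    #define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)

    static int
    highbit(unsigned long i)
    {
            int h = 1;

            if (i == 0)
                    return (0);
            while (i > 1) {         /* equivalent to the shift cascade */
                    i >>= 1;
                    h++;
            }
            return (h);
    }

    int
    main(void)
    {
            assert(highbit(0) == 0);                /* no bit set */
            assert(highbit(1) == 1);                /* bit 0 -> 1 */
            assert(highbit(0x80000000UL) == 32);    /* bit 31 -> 32 */
            assert(SIGNOF(-7) == -1);
            assert(SIGNOF(0) == 0);
            assert(SIGNOF(42) == 1);
            return (0);
    }
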
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
index 3878ded5e973..fb3f76d7765c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
@@ -45,6 +45,8 @@ typedef struct taskq taskq_t;
typedef uintptr_t taskqid_t;
typedef void (task_func_t)(void *);
+struct proc;
+
/*
* Public flags for taskq_create(): bit range 0-15
*/
@@ -52,6 +54,7 @@ typedef void (task_func_t)(void *);
#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */
+#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */
/*
* Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
@@ -61,6 +64,7 @@ typedef void (task_func_t)(void *);
#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
+#define TQ_FRONT 0x08 /* Put task at the front of the queue */
#ifdef _KERNEL
@@ -72,6 +76,10 @@ extern void taskq_mp_init(void);
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int,
int, uint_t);
+extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
+ struct proc *, uint_t);
+extern taskq_t *taskq_create_sysdc(const char *, int, int, int,
+ struct proc *, uint_t, uint_t);
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void nulltask(void *);
extern void taskq_destroy(taskq_t *);
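
TQ_FRONT lets a dispatcher jump the queue instead of appending. A minimal kernel-side sketch, assuming the conventional illumos taskq(9F) argument order for taskq_create() (name, nthreads, pri, minalloc, maxalloc, flags), plus the minclsyspri and TASKQ_PREPOPULATE definitions that live elsewhere in the kernel headers:

    /* Illustrative only: TQ_FRONT places an entry ahead of queued work. */
    #include <sys/taskq.h>

    static void
    urgent_cb(void *arg)
    {
            /* ... time-critical work ... */
    }

    static void
    example(void *arg)
    {
            taskq_t *tq;

            tq = taskq_create("example_tq", 4, minclsyspri, 4, 16,
                TASKQ_PREPOPULATE);
            /* Go to the head of the queue instead of the tail. */
            (void) taskq_dispatch(tq, urgent_cb, arg, TQ_SLEEP | TQ_FRONT);
            taskq_destroy(tq);      /* waits for outstanding tasks */
    }
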
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
index d60721c5a786..77c9c0bf84fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
@@ -36,6 +36,30 @@
extern "C" {
#endif
+#ifdef sun
+/*
+ * Unicode encoding conversion functions and their macros.
+ */
+#define UCONV_IN_BIG_ENDIAN 0x0001
+#define UCONV_OUT_BIG_ENDIAN 0x0002
+#define UCONV_IN_SYSTEM_ENDIAN 0x0004
+#define UCONV_OUT_SYSTEM_ENDIAN 0x0008
+#define UCONV_IN_LITTLE_ENDIAN 0x0010
+#define UCONV_OUT_LITTLE_ENDIAN 0x0020
+#define UCONV_IGNORE_NULL 0x0040
+#define UCONV_IN_ACCEPT_BOM 0x0080
+#define UCONV_OUT_EMIT_BOM 0x0100
+
+extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *,
+ int);
+extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *,
+ int);
+extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
+extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
+#endif /* sun */
+
/*
* UTF-8 text preparation functions and their macros.
*
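
The uconv_* declarations above are Solaris-only (note the #ifdef sun). A sketch of UTF-8 to UTF-16 conversion, assuming the uconv_u8tou16(9F) convention that the two size_t pointers carry buffer lengths on entry and the units actually consumed and produced on return, and that the return value is 0 or an errno:

    /* Illustrative only: UTF-8 -> native-endian UTF-16 via uconv_u8tou16(). */
    #include <sys/types.h>
    #include <sys/u8_textprep.h>

    static int
    to_utf16(const uchar_t *u8, size_t u8len, uint16_t *u16, size_t *u16lenp)
    {
            size_t inlen = u8len;   /* bytes available on entry */

            /* UCONV_IGNORE_NULL: an embedded NUL is copied, not rejected. */
            return (uconv_u8tou16(u8, &inlen, u16, u16lenp,
                UCONV_OUT_SYSTEM_ENDIAN | UCONV_IGNORE_NULL));
    }
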
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
index ab95b99b9ce6..974c915dd553 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
@@ -18,9 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -41,10 +41,6 @@
#include_next <sys/vnode.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
#define IS_DEVVP(vp) \
((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
@@ -69,6 +65,10 @@ typedef struct xoptattr {
uint8_t xoa_av_quarantined;
uint8_t xoa_av_modified;
uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+ uint8_t xoa_reparse;
+ uint64_t xoa_generation;
+ uint8_t xoa_offline;
+ uint8_t xoa_sparse;
} xoptattr_t;
/*
@@ -209,11 +209,15 @@ typedef struct xvattr {
#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */
#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */
#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */
+#define XAT0_REPARSE 0x00002000 /* FS reparse point */
+#define XAT0_GEN 0x00004000 /* object generation number */
+#define XAT0_OFFLINE 0x00008000 /* offline */
+#define XAT0_SPARSE 0x00010000 /* sparse */
#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
- XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| \
- XAT0_AV_MODIFIED|XAT0_AV_SCANSTAMP)
+ XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \
+ XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE)
/* Support for XAT_* optional attributes */
#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */
@@ -246,6 +250,10 @@ typedef struct xvattr {
#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
+#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
+#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
+#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
+#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
/*
* The returned attribute map array (xva_rtnattrmap[]) is located past the
@@ -305,7 +313,6 @@ typedef struct xvattr {
#define MODEMASK 07777 /* mode bits plus permission bits */
#define PERMMASK 00777 /* permission bits */
-
/*
* VOP_ACCESS flags
*/
@@ -358,15 +365,12 @@ typedef struct caller_context {
ulong_t cc_flags;
} caller_context_t;
-/*
- * Structure tags for function prototypes, defined elsewhere.
- */
struct taskq;
/*
* Flags for VOP_LOOKUP
*
- * Defined in file.h, but also possible, FIGNORECASE
+ * Defined in file.h, but also possible, FIGNORECASE and FSEARCH
*
*/
#define LOOKUP_DIR 0x01 /* want parent dir vp */
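
The four new XAT_* attributes follow the header's established request/return pattern: set the request bit, make the getattr call, then check the returned-attribute map before trusting the value. A sketch of querying one of them, assuming the stock xva_init(), xva_getxoptattr(), and XVA_SET_REQ()/XVA_ISSET_RTN() helpers from this header, with a schematic OpenSolaris-style VOP_GETATTR() call:

    /* Illustrative only: ask a filesystem whether a file is sparse. */
    static int
    is_sparse(vnode_t *vp, cred_t *cr, boolean_t *sparsep)
    {
            xvattr_t xva;
            xoptattr_t *xoap;
            int error;

            xva_init(&xva);
            XVA_SET_REQ(&xva, XAT_SPARSE);  /* request the new attribute */

            if ((error = VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL)) != 0)
                    return (error);

            /* Only trust the bit if the FS reports it actually returned it. */
            if (XVA_ISSET_RTN(&xva, XAT_SPARSE)) {
                    xoap = xva_getxoptattr(&xva);
                    *sparsep = (xoap->xoa_sparse != 0);
            } else {
                    *sparsep = B_FALSE;
            }
            return (0);
    }
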
diff --git a/sys/modules/opensolaris/Makefile b/sys/modules/opensolaris/Makefile
index ca4f12a39f6f..54dcfe6c9d4c 100644
--- a/sys/modules/opensolaris/Makefile
+++ b/sys/modules/opensolaris/Makefile
@@ -6,7 +6,8 @@ KMOD= opensolaris
SRCS= opensolaris.c \
opensolaris_cmn_err.c \
opensolaris_kmem.c \
- opensolaris_misc.c
+ opensolaris_misc.c \
+ opensolaris_sunddi.c
_A=${.CURDIR}/../../cddl/contrib/opensolaris/common/atomic
.if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S)
diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile
index d0a6f44bde3e..53271f49efbf 100644
--- a/sys/modules/zfs/Makefile
+++ b/sys/modules/zfs/Makefile
@@ -2,7 +2,7 @@
KMOD= zfs
-SRCS= vnode_if.h
+SRCS= bus_if.h device_if.h vnode_if.h
SUNW= ${.CURDIR}/../../cddl/contrib/opensolaris
@@ -12,6 +12,7 @@ SRCS+= acl_common.c
SRCS+= avl.c
.PATH: ${SUNW}/common/nvpair
SRCS+= nvpair.c
+SRCS+= nvpair_alloc_fixed.c
.PATH: ${.CURDIR}/../../cddl/contrib/opensolaris/common/unicode
SRCS+= u8_textprep.c
@@ -22,6 +23,7 @@ SRCS+= opensolaris_kstat.c
SRCS+= opensolaris_lookup.c
SRCS+= opensolaris_policy.c
SRCS+= opensolaris_string.c
+SRCS+= opensolaris_sysevent.c
SRCS+= opensolaris_taskq.c
SRCS+= opensolaris_uio.c
SRCS+= opensolaris_vfs.c
@@ -44,6 +46,7 @@ SRCS+= vnode.c
.PATH: ${SUNW}/uts/common/os
SRCS+= callb.c
+SRCS+= fm.c
SRCS+= list.c
SRCS+= nvpair_alloc_system.c
@@ -59,6 +62,9 @@ SRCS+= zmod.c
SRCS+= zmod_subr.c
SRCS+= zutil.c
+.PATH: ${.CURDIR}/../../crypto/sha2
+SRCS+= sha2.c
+
.PATH: ${SUNW}/common/zfs
.include "${SUNW}/uts/common/Makefile.files"
.PATH: ${SUNW}/uts/common/fs/zfs
@@ -79,6 +85,10 @@ CFLAGS+=-I${SUNW}/common
CFLAGS+=-I${.CURDIR}/../../../include
CFLAGS+=-DBUILDING_ZFS
+.if ${TARGET_ARCH} == "powerpc64"
+CFLAGS+=-mminimal-toc
+.endif
+
#CFLAGS+=-DDEBUG=1
#DEBUG_FLAGS=-g
diff --git a/usr.bin/fstat/zfs.c b/usr.bin/fstat/zfs.c
index 96cdff6870a0..cdca41ff1a75 100644
--- a/usr.bin/fstat/zfs.c
+++ b/usr.bin/fstat/zfs.c
@@ -43,6 +43,7 @@
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
#include <err.h>
#include <kvm.h>