Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/Makefile.files | 22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c | 153
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c | 5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 733
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c | 316
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c | 495
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 962
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c | 1152
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c | 156
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c | 911
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c | 245
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c | 34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c | 1196
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c | 1030
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c | 244
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 373
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c | 10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c | 683
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c | 87
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c | 2080
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c | 474
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c | 57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c | 281
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c | 376
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c | 866
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c | 1766
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c | 1060
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c | 61
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c | 245
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c | 37
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c | 344
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c | 40
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c | 1970
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c | 123
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c | 2904
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c | 111
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c | 45
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c | 150
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c | 454
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c | 11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h | 34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h | 60
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h | 91
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h | 93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h | 246
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h | 208
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h | 40
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h | 100
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h | 21
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h | 15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h | 76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h | 114
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h | 87
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h | 7
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h | 62
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h | 52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h | 108
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h | 8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h | 22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h | 10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h | 17
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h | 171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h | 287
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h | 265
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h | 5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 83
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h | 11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h | 10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h | 11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h | 37
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h | 39
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h | 72
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h | 28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h | 35
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h | 38
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h | 9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h | 15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h | 5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h | 172
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h | 66
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h | 142
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h | 55
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h | 24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h | 116
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h | 134
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h | 63
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h | 330
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h | 26
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h | 16
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h | 182
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h | 66
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h | 28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c | 175
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c | 990
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c | 119
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c | 45
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c | 171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c | 174
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c | 35
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c | 21
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 137
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c | 1546
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c | 20
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c | 244
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c | 171
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c | 539
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c | 1290
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c | 234
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c | 95
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c | 247
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c | 691
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c | 64
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 2916
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c | 178
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c | 252
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c | 147
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c | 8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c | 334
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 863
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c | 2628
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c | 1065
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c | 1583
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | 993
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c | 140
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c | 98
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c | 181
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c | 86
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c | 194
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c | 1967
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/os/callb.c | 76
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/os/fm.c | 1402
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/acl.h | 4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h | 2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/avl.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h | 170
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/callb.h | 12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h | 27
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h | 112
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/cred.h | 13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/debug.h | 23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h | 13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h | 57
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h | 375
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h | 93
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h | 9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h | 10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h | 10
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/processor.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h | 132
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h | 256
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h | 52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h | 42
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h | 8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h | 24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h | 30
163 files changed, 37402 insertions, 12531 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 2aaf5bcdaff9..2ab1d7b8ea9b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -20,8 +20,8 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+#
#
# This Makefile defines all file modules for the directory uts/common
# and its children. These are the source files which may be considered
@@ -30,8 +30,12 @@
ZFS_COMMON_OBJS += \
arc.o \
bplist.o \
+ bpobj.o \
dbuf.o \
+ ddt.o \
+ ddt_zap.o \
dmu.o \
+ dmu_diff.o \
dmu_send.o \
dmu_object.o \
dmu_objset.o \
@@ -41,17 +45,18 @@ ZFS_COMMON_OBJS += \
dnode_sync.o \
dsl_dir.o \
dsl_dataset.o \
+ dsl_deadlist.o \
dsl_pool.o \
dsl_synctask.o \
dmu_zfetch.o \
dsl_deleg.o \
dsl_prop.o \
- dsl_scrub.o \
- fletcher.o \
+ dsl_scan.o \
gzip.o \
lzjb.o \
metaslab.o \
refcount.o \
+ sa.o \
sha256.o \
spa.o \
spa_config.o \
@@ -75,20 +80,25 @@ ZFS_COMMON_OBJS += \
zap_leaf.o \
zap_micro.o \
zfs_byteswap.o \
+ zfs_debug.o \
zfs_fm.o \
zfs_fuid.o \
+ zfs_sa.o \
zfs_znode.o \
zil.o \
zio.o \
zio_checksum.o \
zio_compress.o \
- zio_inject.o
+ zio_inject.o \
+ zle.o \
+ zrlock.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
zfs_deleg.o \
zfs_prop.o \
zfs_comutil.o \
+ zfs_fletcher.o \
zpool_prop.o \
zprop_common.o
@@ -99,7 +109,9 @@ ZFS_OBJS += \
zfs_ctldir.o \
zfs_dir.o \
zfs_ioctl.o \
+ zfs_ioctl_compat.o \
zfs_log.o \
+ zfs_onexit.o \
zfs_replay.o \
zfs_rlock.o \
rrwlock.o \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index 269c3ebe75d8..436918b35c13 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -40,7 +40,6 @@
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/cred.h>
-#include <sys/kdb.h>
#include <sys/gfs.h>
@@ -108,6 +107,42 @@
* gfs_root_create_file()
*/
+#ifdef sun
+/*
+ * gfs_make_opsvec: take an array of vnode type definitions and create
+ * their vnodeops_t structures
+ *
+ * This routine takes an array of gfs_opsvec_t's. It could
+ * alternatively take an array of gfs_opsvec_t*'s, which would allow
+ * vnode types to be completely defined in files external to the caller
+ * of gfs_make_opsvec(). As it stands, much more sharing takes place --
+ * both the caller and the vnode type provider need to access gfsv_ops
+ * and gfsv_template, and the caller also needs to know gfsv_name.
+ */
+int
+gfs_make_opsvec(gfs_opsvec_t *vec)
+{
+ int error, i;
+
+ for (i = 0; ; i++) {
+ if (vec[i].gfsv_name == NULL)
+ return (0);
+ error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
+ vec[i].gfsv_ops);
+ if (error)
+ break;
+ }
+
+ cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
+ vec[i].gfsv_name);
+ for (i--; i >= 0; i--) {
+ vn_freevnodeops(*vec[i].gfsv_ops);
+ *vec[i].gfsv_ops = NULL;
+ }
+ return (error);
+}
+#endif /* sun */
+
/*
* Low level directory routines
*
@@ -312,6 +347,22 @@ gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
cookies));
}
+#ifdef sun
+/*
+ * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
+ * instead of a string for the entry's name.
+ */
+int
+gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
+ ino64_t ino, unsigned long num)
+{
+ char buf[40];
+
+ numtos(num, buf);
+ return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
+}
+#endif
+
/*
* gfs_readdir_pred: readdir loop predicate
* voffp - a pointer in which the next virtual offset should be stored
@@ -542,6 +593,28 @@ gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
return (vp);
}
+#ifdef sun
+/*
+ * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
+ *
+ * Similar to gfs_root_create(), this creates a root vnode for a file to
+ * be the pseudo-filesystem.
+ */
+vnode_t *
+gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
+{
+ vnode_t *vp = gfs_file_create(size, NULL, ops);
+
+ ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
+
+ VFS_HOLD(vfsp);
+ VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
+ vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
+
+ return (vp);
+}
+#endif /* sun */
+
/*
* gfs_file_inactive()
*
@@ -570,7 +643,7 @@ gfs_file_inactive(vnode_t *vp)
*/
if ((dp = fp->gfs_parent->v_data) == NULL)
return (NULL);
-
+
/*
* First, see if this vnode is cached in the parent.
*/
@@ -995,6 +1068,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
+
/*
* gfs_vop_lookup: VOP_LOOKUP() entry point
*
@@ -1062,6 +1136,81 @@ gfs_vop_readdir(ap)
return (error);
}
+
+#ifdef sun
+/*
+ * gfs_vop_map: VOP_MAP() entry point
+ *
+ * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
+ * This function only works for readonly files, and uses the read function for
+ * the vnode to fill in the data. The mapped data is immediately faulted in and
+ * filled with the necessary data during this call; there are no getpage() or
+ * putpage() routines.
+ */
+/* ARGSUSED */
+int
+gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
+ caller_context_t *ct)
+{
+ int rv;
+ ssize_t resid = len;
+
+ /*
+ * Check for bad parameters
+ */
+#ifdef _ILP32
+ if (len > MAXOFF_T)
+ return (ENOMEM);
+#endif
+ if (vp->v_flag & VNOMAP)
+ return (ENOTSUP);
+ if (off > MAXOFF_T)
+ return (EFBIG);
+ if ((long)off < 0 || (long)(off + len) < 0)
+ return (EINVAL);
+ if (vp->v_type != VREG)
+ return (ENODEV);
+ if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
+ return (EACCES);
+
+ /*
+ * Find appropriate address if needed, otherwise clear address range.
+ */
+ as_rangelock(as);
+ rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (rv != 0) {
+ as_rangeunlock(as);
+ return (rv);
+ }
+
+ /*
+ * Create mapping
+ */
+ rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
+ as_rangeunlock(as);
+ if (rv != 0)
+ return (rv);
+
+ /*
+ * Fill with data from read()
+ */
+ rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
+ 0, (rlim64_t)0, cred, &resid);
+
+ if (rv == 0 && resid != 0)
+ rv = ENXIO;
+
+ if (rv != 0) {
+ as_rangelock(as);
+ (void) as_unmap(as, *addrp, len);
+ as_rangeunlock(as);
+ }
+
+ return (rv);
+}
+#endif /* sun */
+
/*
* gfs_vop_inactive: VOP_INACTIVE() entry point
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
index f4e2449f018f..83f29c154de9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -18,9 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -75,7 +75,6 @@ xva_getxoptattr(xvattr_t *xvap)
static void
vn_rele_inactive(vnode_t *vp)
{
-
vrele(vp);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 38b39bf021fb..2adad8ad726c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -119,16 +118,17 @@
#include <sys/spa.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <zfs_fletcher.h>
#include <sys/sdt.h>
#include <vm/vm_pageout.h>
@@ -178,7 +178,6 @@ static boolean_t arc_warm;
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
@@ -186,14 +185,11 @@ int zfs_arc_p_min_shift = 0;
TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
"Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
"Minimum ARC size");
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
* Note that buffers can be in one of 6 states:
@@ -500,6 +496,7 @@ struct arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
@@ -560,7 +557,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
-#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -609,8 +605,8 @@ static buf_hash_table_t buf_hash_table;
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(buf) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
@@ -634,7 +630,7 @@ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
@@ -788,6 +784,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+}
+
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
@@ -921,7 +926,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
arc_buf_t *buf = vbuf;
bzero(buf, sizeof (arc_buf_t));
- rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
@@ -937,6 +943,7 @@ hdr_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *buf = vbuf;
+ ASSERT(BUF_EMPTY(buf));
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
@@ -949,7 +956,8 @@ buf_dest(void *vbuf, void *unused)
{
arc_buf_t *buf = vbuf;
- rw_destroy(&buf->b_lock);
+ mutex_destroy(&buf->b_evict_lock);
+ rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
@@ -1077,18 +1085,31 @@ arc_buf_thaw(arc_buf_t *buf)
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_thawed)
+ kmem_free(buf->b_hdr->b_thawed, 1);
+ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
void
arc_buf_freeze(arc_buf_t *buf)
{
+ kmutex_t *hash_lock;
+
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
+ mutex_exit(hash_lock);
}
static void
@@ -1111,7 +1132,6 @@ get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lo
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
-
ASSERT(MUTEX_HELD(hash_lock));
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
@@ -1185,6 +1205,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
@@ -1207,7 +1228,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
/*
* If prefetching out of the ghost cache,
- * we will have a non-null datacnt.
+ * we will have a non-zero datacnt.
*/
if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
/* ghost elements have a ghost size */
@@ -1245,9 +1266,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon) {
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
buf_hash_remove(ab);
- }
/* adjust state sizes */
if (to_delta)
@@ -1391,14 +1411,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_state == arc_anon);
ASSERT(buf->b_data != NULL);
- VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
- VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
@@ -1406,6 +1441,8 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
+ ASSERT(hdr->b_state != arc_anon);
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@@ -1430,16 +1467,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
* must verify b_data != NULL to know if the add_ref
* was successful.
*/
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
if (buf->b_data == NULL) {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return;
}
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
- rw_exit(&buf->b_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ mutex_exit(&buf->b_evict_lock);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
@@ -1487,6 +1524,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1524,6 +1562,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
+ buf->b_next = NULL;
ASSERT(buf->b_efunc == NULL);
@@ -1538,55 +1577,55 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!(hdr->b_flags & ARC_STORED));
+ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
- if (hdr->b_l2hdr != NULL) {
- if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
+ if (l2hdr != NULL) {
+ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ if (!buflist_held) {
mutex_enter(&l2arc_buflist_mtx);
- if (hdr->b_l2hdr != NULL) {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
- hdr);
- }
- mutex_exit(&l2arc_buflist_mtx);
- } else {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ l2hdr = hdr->b_l2hdr;
}
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
+
+ if (l2hdr != NULL) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ if (!buflist_held)
+ mutex_exit(&l2arc_buflist_mtx);
}
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
}
while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1596,6 +1635,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
+ if (hdr->b_thawed) {
+ kmem_free(hdr->b_thawed, 1);
+ hdr->b_thawed = NULL;
+ }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1616,11 +1659,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1)
+ if (hdr->b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
- else
+ } else {
+ ASSERT(buf == hdr->b_buf);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
mutex_exit(hash_lock);
} else if (HDR_IO_IN_PROGRESS(hdr)) {
int destroy_hdr;
@@ -1637,12 +1686,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
- if (remove_reference(hdr, NULL, tag) > 0) {
- ASSERT(HDR_IO_ERROR(hdr));
+ if (remove_reference(hdr, NULL, tag) > 0)
arc_buf_destroy(buf, FALSE, TRUE);
- } else {
+ else
arc_hdr_destroy(hdr);
- }
}
}
@@ -1654,11 +1701,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
int no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
+ ASSERT(hdr->b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT(hdr->b_state != arc_anon);
ASSERT(buf->b_data != NULL);
@@ -1668,6 +1718,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
}
ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1747,7 +1798,8 @@ evict_start:
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
- LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - ab->b_arc_access <
+ arc_min_prefetch_lifespan)) {
skipped++;
continue;
}
@@ -1762,7 +1814,7 @@ evict_start:
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
- if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
@@ -1784,9 +1836,9 @@ evict_start:
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
@@ -1887,6 +1939,7 @@ static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t marker = { 0 };
list_t *list, *list_start;
kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
@@ -1913,7 +1966,15 @@ evict_start:
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
hash_lock = HDR_LOCK(ab);
+ /* caller may be trying to modify this buffer, skip it */
+ if (MUTEX_HELD(hash_lock))
+ continue;
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -1936,18 +1997,21 @@ evict_start:
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
- } else {
- if (bytes < 0) {
- /*
- * we're draining the ARC, retry
- */
- mutex_exit(lock);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto evict_start;
- }
+ } else if (bytes < 0) {
+ /*
+ * Insert a list marker and then wait for the
+ * hash lock to become available. Once its
+ * available, restart from where we left off.
+ */
+ list_insert_after(list, ab, &marker);
+ mutex_exit(lock);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ mutex_enter(lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ } else
bufs_skipped += 1;
- }
}
mutex_exit(lock);
idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
@@ -2056,9 +2120,9 @@ restart:
while (tmp_arc_eviction_list != NULL) {
arc_buf_t *buf = tmp_arc_eviction_list;
tmp_arc_eviction_list = buf->b_next;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
if (buf->b_efunc != NULL)
VERIFY(buf->b_efunc(buf) == 0);
@@ -2148,11 +2212,9 @@ static int needfree = 0;
static int
arc_reclaim_needed(void)
{
-#if 0
- uint64_t extra;
-#endif
#ifdef _KERNEL
+
if (needfree)
return (1);
@@ -2163,7 +2225,7 @@ arc_reclaim_needed(void)
if (vm_paging_needed())
return (1);
-#if 0
+#ifdef sun
/*
* take 'desfree' extra pages, so we reclaim sooner, rather than later
*/
@@ -2205,10 +2267,10 @@ arc_reclaim_needed(void)
(btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
return (1);
#endif
-#else
+#else /* !sun */
if (kmem_used() > (kmem_size() * 3) / 4)
return (1);
-#endif
+#endif /* sun */
#else
if (spa_get_random(100) == 0)
@@ -2290,7 +2352,7 @@ arc_reclaim_thread(void *dummy __unused)
}
/* reset the growth delay for every reclaim */
- growtime = LBOLT + (arc_grow_retry * hz);
+ growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
/*
@@ -2304,7 +2366,7 @@ arc_reclaim_thread(void *dummy __unused)
arc_kmem_reap_now(last_reclaim);
arc_warm = B_TRUE;
- } else if (arc_no_grow && LBOLT >= growtime) {
+ } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
arc_no_grow = FALSE;
}
@@ -2411,7 +2473,7 @@ arc_evict_needed(arc_buf_contents_t type)
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
return (1);
-#if 0
+#ifdef sun
#ifdef _KERNEL
/*
* If zio data pages are being allocated out of a separate heap segment,
@@ -2423,7 +2485,7 @@ arc_evict_needed(arc_buf_contents_t type)
(vmem_size(zio_arena, VMEM_ALLOC) >> 5))
return (1);
#endif
-#endif
+#endif /* sun */
if (arc_reclaim_needed())
return (1);
@@ -2543,6 +2605,8 @@ out:
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
+ clock_t now;
+
ASSERT(MUTEX_HELD(hash_lock));
if (buf->b_state == arc_anon) {
@@ -2553,11 +2617,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
ASSERT(buf->b_arc_access == 0);
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
arc_change_state(arc_mru, buf, hash_lock);
} else if (buf->b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
/*
* If this buffer is here because of a prefetch, then either:
* - clear the flag if this is a "referencing" read
@@ -2573,7 +2639,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = now;
return;
}
@@ -2582,13 +2648,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
+ if (now > buf->b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
@@ -2611,7 +2677,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2630,7 +2696,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(list_link_active(&buf->b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
@@ -2648,7 +2714,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
new_state = arc_mru;
}
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
@@ -2658,7 +2724,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* This buffer is on the 2nd Level ARC.
*/
- buf->b_arc_access = LBOLT;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
} else {
@@ -2671,7 +2737,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ if (zio == NULL || zio->io_error == 0)
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
@@ -2685,6 +2752,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
*bufp = NULL;
} else {
*bufp = buf;
+ ASSERT(buf->b_data);
}
}
@@ -2732,6 +2800,16 @@ arc_read_done(zio_t *zio)
arc_cksum_compute(buf, B_FALSE);
+ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
/* create copies of the data buffer for the callers */
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2745,8 +2823,11 @@ arc_read_done(zio_t *zio)
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
+ if (abuf == buf) {
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(hdr->b_datacnt == 1);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
@@ -2767,14 +2848,6 @@ arc_read_done(zio_t *zio)
cv_broadcast(&hdr->b_cv);
if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
/*
@@ -2825,27 +2898,37 @@ arc_read_done(zio_t *zio)
*
* Normal callers should use arc_read and pass the arc buffer and offset
* for the bp. But if you know you don't need locking, you can use
- * arc_read_bp.
+ * arc_read_nolock.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */
+ return (arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb));
+ }
+
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_lock, RW_READER);
+ rw_enter(&pbuf->b_data_lock, RW_READER);
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_lock);
+ rw_exit(&pbuf->b_data_lock);
+
return (err);
}
int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
@@ -2856,7 +2939,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
uint64_t guid = spa_guid(spa);
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
if (hdr && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@@ -2910,6 +2994,7 @@ top:
} else {
buf = arc_buf_clone(buf);
}
+
} else if (*arc_flags & ARC_PREFETCH &&
refcount_count(&hdr->b_refcnt) == 0) {
hdr->b_flags |= ARC_PREFETCH;
@@ -2940,15 +3025,13 @@ top:
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
if (exists) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
(void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
@@ -2983,12 +3066,14 @@ top:
buf->b_private = NULL;
buf->b_next = NULL;
hdr->b_buf = buf;
- arc_get_data_buf(buf);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
-
+ arc_get_data_buf(buf);
+ arc_access(hdr, hash_lock);
}
+ ASSERT(!GHOST_STATE(hdr->b_state));
+
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
@@ -2997,17 +3082,6 @@ top:
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
- /*
- * If the buffer has been evicted, migrate it to a present state
- * before issuing the I/O. Once we drop the hash-table lock,
- * the header will be marked as I/O in progress and have an
- * attached buffer. At this point, anybody who finds this
- * buffer ought to notice that it's legit but has a pending I/O.
- */
-
- if (GHOST_STATE(hdr->b_state))
- arc_access(hdr, hash_lock);
-
if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -3023,8 +3097,8 @@ top:
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
- zbookmark_t *, zb);
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+ uint64_t, size, zbookmark_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -3110,47 +3184,15 @@ top:
return (0);
}
-/*
- * arc_read() variant to support pool traversal. If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_mtx;
- uint64_t guid = spa_guid(spa);
- int rc = 0;
-
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
- if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
-
- ASSERT(buf);
- while (buf->b_data == NULL) {
- buf = buf->b_next;
- ASSERT(buf);
- }
- bcopy(buf->b_data, data, hdr->b_size);
- } else {
- rc = ENOENT;
- }
-
- if (hash_mtx)
- mutex_exit(hash_mtx);
-
- return (rc);
-}
-
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
ASSERT(buf->b_hdr != NULL);
ASSERT(buf->b_hdr->b_state != arc_anon);
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
buf->b_efunc = func;
buf->b_private = private;
}
@@ -3169,14 +3211,14 @@ arc_buf_evict(arc_buf_t *buf)
list_t *list, *evicted_list;
kmutex_t *lock, *evicted_lock;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (0);
} else if (buf->b_data == NULL) {
arc_buf_t copy = *buf; /* structure assignment */
@@ -3185,14 +3227,15 @@ arc_buf_evict(arc_buf_t *buf)
* but let arc_do_user_evicts() do the reaping.
*/
buf->b_efunc = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(copy.b_efunc(&copy) == 0);
return (1);
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -3211,6 +3254,7 @@ arc_buf_evict(arc_buf_t *buf)
arc_state_t *old_state = hdr->b_state;
arc_state_t *evicted_state;
+ ASSERT(hdr->b_buf == NULL);
ASSERT(refcount_is_zero(&hdr->b_refcnt));
evicted_state =
@@ -3230,12 +3274,13 @@ arc_buf_evict(arc_buf_t *buf)
mutex_exit(lock);
}
mutex_exit(hash_lock);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_hdr = NULL;
+ buf->b_next = NULL;
kmem_cache_free(buf_cache, buf);
return (1);
}
@@ -3250,29 +3295,30 @@ void
arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size;
- boolean_t released = B_FALSE;
- rw_enter(&buf->b_lock, RW_WRITER);
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
- ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- rw_exit(&buf->b_lock);
- released = B_TRUE;
} else {
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
}
l2hdr = hdr->b_l2hdr;
@@ -3282,9 +3328,6 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size;
}
- if (released)
- goto out;
-
/*
* Do we have more than one buf?
*/
@@ -3298,14 +3341,14 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
+ *bufp = buf->b_next;
buf->b_next = NULL;
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3333,26 +3376,25 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
+ if (hash_lock)
+ mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
-out:
if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3361,14 +3403,27 @@ out:
}
}
+/*
+ * Release this buffer. If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb)
+{
+ arc_release(buf, tag);
+ return (0);
+}
+
int
arc_released(arc_buf_t *buf)
{
int released;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (released);
}
@@ -3377,9 +3432,9 @@ arc_has_callback(arc_buf_t *buf)
{
int callback;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
callback = (buf->b_efunc != NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (callback);
}
@@ -3389,9 +3444,9 @@ arc_referenced(arc_buf_t *buf)
{
int referenced;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
referenced = (refcount_count(&buf->b_hdr->b_refcnt));
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (referenced);
}
#endif
@@ -3431,21 +3486,28 @@ arc_write_done(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- hdr->b_acb = NULL;
+ ASSERT(hdr->b_acb == NULL);
+
+ if (zio->io_error == 0) {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ } else {
+ ASSERT(BUF_EMPTY(hdr));
+ }
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefor remain anonymous (and uncached).
+ * so there will be no dva/birth/checksum. The buffer must
+ * therefore remain anonymous (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
+ ASSERT(zio->io_error == 0);
+
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
@@ -3455,106 +3517,54 @@ arc_write_done(zio_t *zio)
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
- ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
- ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)));
- ASSERT3U(zio->io_bp_orig.blk_birth, ==,
- zio->io_bp->blk_birth);
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (hdr->b_state == arc_anon)
+ if (!exists && hdr->b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- hdr->b_flags &= ~ARC_STORED;
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
}
-static void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
- boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
- /* Determine checksum setting */
- if (ismd) {
- /*
- * Metadata always gets checksummed. If the data
- * checksum is multi-bit correctable, and it's not a
- * ZBT-style checksum, then it's suitable for metadata
- * as well. Otherwise, the metadata checksum defaults
- * to fletcher4.
- */
- if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
- !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
- zp->zp_checksum = wp->wp_oschecksum;
- else
- zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
- wp->wp_oschecksum);
- }
-
- /* Determine compression setting */
- if (ismd) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
- } else {
- zp->zp_compress = zio_compress_select(wp->wp_dncompress,
- wp->wp_oscompress);
- }
-
- zp->zp_type = wp->wp_type;
- zp->zp_level = wp->wp_level;
- zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
- zio_prop_t zp;
ASSERT(ready != NULL);
+ ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
+ ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3563,103 +3573,27 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
callback->awcb_private = private;
callback->awcb_buf = buf;
- write_policy(spa, wp, &zp);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- arc_buf_hdr_t *ab;
- kmutex_t *hash_lock;
- zio_t *zio;
- uint64_t guid = spa_guid(spa);
-
- /*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
- */
- ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
- bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
- if (ab->b_state != arc_anon)
- arc_change_state(arc_anon, ab, hash_lock);
- if (HDR_IO_IN_PROGRESS(ab)) {
- /*
- * This should only happen when we prefetch.
- */
- ASSERT(ab->b_flags & ARC_PREFETCH);
- ASSERT3U(ab->b_datacnt, ==, 1);
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
- ab->b_flags |= ARC_FREE_IN_PROGRESS;
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- }
- }
-
- zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
-}
-
static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count
- + cnt.v_cache_count);
+ uint64_t available_memory =
+ ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
-#if 0
+#ifdef sun
#if defined(__i386)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
-#endif
+#endif /* sun */
if (available_memory >= zfs_write_limit_max)
return (0);
@@ -3776,10 +3710,12 @@ arc_lowmem(void *arg __unused, int howto __unused)
/* Serialize access via arc_lowmem_lock. */
mutex_enter(&arc_lowmem_lock);
+ mutex_enter(&arc_reclaim_thr_lock);
needfree = 1;
cv_signal(&arc_reclaim_thr_cv);
while (needfree)
- tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
+ msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
+ mutex_exit(&arc_reclaim_thr_lock);
mutex_exit(&arc_lowmem_lock);
}
#endif
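
For context on the tsleep() to msleep() switch above, a generic FreeBSD sleep/wakeup sketch (hypothetical names, not taken from arc.c): by testing and sleeping on the flag while holding the same mutex the waker takes, the wakeup cannot fire between the test and the sleep, which is the lost-wakeup window the old unlocked tsleep() left open.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/systm.h>

static struct mtx example_lock;		/* assumed initialized via mtx_init() */
static int example_flag;

static void
example_waiter(void)
{
	mtx_lock(&example_lock);
	example_flag = 1;
	while (example_flag)
		/* msleep() atomically drops and re-acquires example_lock. */
		msleep(&example_flag, &example_lock, 0, "exwait", 0);
	mtx_unlock(&example_lock);
}

static void
example_waker(void)
{
	mtx_lock(&example_lock);
	example_flag = 0;
	wakeup(&example_flag);
	mtx_unlock(&example_lock);
}
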
@@ -3787,8 +3723,7 @@ arc_lowmem(void *arg __unused, int howto __unused)
void
arc_init(void)
{
- int prefetch_tunable_set = 0;
- int i;
+ int i, prefetch_tunable_set = 0;
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
@@ -3799,7 +3734,8 @@ arc_init(void)
/* Start out with 1/8 of all memory */
arc_c = kmem_size() / 8;
-#if 0
+
+#ifdef sun
#ifdef _KERNEL
/*
* On architectures where the physical memory can be larger
@@ -3808,7 +3744,7 @@ arc_init(void)
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
-#endif
+#endif /* sun */
/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<18);
/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
@@ -3817,16 +3753,18 @@ arc_init(void)
else
arc_c_max = arc_c_min;
arc_c_max = MAX(arc_c * 5, arc_c_max);
+
#ifdef _KERNEL
/*
* Allow the tunables to override our calculations if they are
* reasonable (ie. over 16MB)
*/
- if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
+ if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
arc_c_max = zfs_arc_max;
- if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
+ if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
arc_c_min = zfs_arc_min;
#endif
+
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
@@ -3936,7 +3874,7 @@ arc_init(void)
"-- to enable,\n");
printf(" add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
- zfs_prefetch_disable=1;
+ zfs_prefetch_disable = 1;
}
#else
if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
@@ -3945,7 +3883,7 @@ arc_init(void)
"than 4GB of RAM is present;\n"
" to enable, add \"vfs.zfs.prefetch_disable=0\" "
"to /boot/loader.conf.\n");
- zfs_prefetch_disable=1;
+ zfs_prefetch_disable = 1;
}
#endif
/* Warn about ZFS memory and address space requirements. */
@@ -4199,7 +4137,7 @@ l2arc_write_size(l2arc_dev_t *dev)
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
- clock_t interval, next;
+ clock_t interval, next, now;
/*
* If the ARC lists are busy, increase our write rate; if the
@@ -4212,7 +4150,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
else
interval = hz * l2arc_feed_secs;
- next = MAX(LBOLT, MIN(LBOLT + interval, began + interval));
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
return (next);
}
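
A hedged numeric illustration of the clamp above (illustrative only, not part of the change): next is the later of "now" and "one interval after the previous write began", but never more than one interval in the future, so a write that overran its interval is followed immediately while a fast write waits out the remainder.

/*
 * With interval == 1000 ticks:
 *   began = 1000, now = 1200 (fast write) -> MAX(1200, MIN(2200, 2000)) = 2000
 *   began = 1000, now = 2500 (slow write) -> MAX(2500, MIN(3500, 2000)) = 2500
 */
static clock_t
example_next_feed(clock_t began, clock_t now, clock_t interval)
{
	return (MAX(now, MIN(now + interval, began + interval)));
}
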
@@ -4414,11 +4353,11 @@ l2arc_read_done(zio_t *zio)
ASSERT(cb != NULL);
buf = cb->l2rcb_buf;
ASSERT(buf != NULL);
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
* Check this survived the L2ARC journey.
@@ -4632,7 +4571,7 @@ top:
}
mutex_exit(&l2arc_buflist_mtx);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
dev->l2ad_evict = taddr;
}
@@ -4802,15 +4741,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- spa_l2cache_space_update(dev->l2ad_vdev, 0,
- dev->l2ad_end - dev->l2ad_hand);
+ vdev_space_update(dev->l2ad_vdev,
+ dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
@@ -4834,7 +4773,7 @@ l2arc_feed_thread(void *dummy __unused)
l2arc_dev_t *dev;
spa_t *spa;
uint64_t size, wrote;
- clock_t begin, next = LBOLT;
+ clock_t begin, next = ddi_get_lbolt();
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
@@ -4843,9 +4782,9 @@ l2arc_feed_thread(void *dummy __unused)
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next - LBOLT);
+ next - ddi_get_lbolt());
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
- next = LBOLT + hz;
+ next = ddi_get_lbolt() + hz;
/*
* Quick check for L2ARC devices.
@@ -4856,7 +4795,7 @@ l2arc_feed_thread(void *dummy __unused)
continue;
}
mutex_exit(&l2arc_dev_mtx);
- begin = LBOLT;
+ begin = ddi_get_lbolt();
/*
* This selects the next l2arc device to write to, and in
@@ -4875,6 +4814,16 @@ l2arc_feed_thread(void *dummy __unused)
ASSERT(spa != NULL);
/*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
* Avoid contributing to memory pressure.
*/
if (arc_reclaim_needed()) {
@@ -4931,7 +4880,7 @@ l2arc_vdev_present(vdev_t *vd)
* validated the vdev and opened it.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
@@ -4945,8 +4894,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_vdev = vd;
adddev->l2ad_write = l2arc_write_max;
adddev->l2ad_boost = l2arc_write_boost;
- adddev->l2ad_start = start;
- adddev->l2ad_end = end;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
@@ -4961,7 +4910,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2node));
- spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
/*
* Add device to global list
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
index 93b7741d77be..066ccc6b1e05 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -19,331 +19,51 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/bplist.h>
#include <sys/zfs_context.h>
-static int
-bplist_hold(bplist_t *bpl)
-{
- ASSERT(MUTEX_HELD(&bpl->bpl_lock));
- if (bpl->bpl_dbuf == NULL) {
- int err = dmu_bonus_hold(bpl->bpl_mos,
- bpl->bpl_object, bpl, &bpl->bpl_dbuf);
- if (err)
- return (err);
- bpl->bpl_phys = bpl->bpl_dbuf->db_data;
- }
- return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
- return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
- DMU_OT_BPLIST_HDR, size, tx));
-}
void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+bplist_create(bplist_t *bpl)
{
- VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(mos, object, &doi);
- if (err)
- return (err);
-
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_dbuf == NULL);
- ASSERT(bpl->bpl_phys == NULL);
- ASSERT(bpl->bpl_cached_dbuf == NULL);
- ASSERT(bpl->bpl_queue == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
- bpl->bpl_mos = mos;
- bpl->bpl_object = object;
- bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
- bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
- bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
}
void
-bplist_close(bplist_t *bpl)
-{
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_queue == NULL);
-
- if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- bpl->bpl_cached_dbuf = NULL;
- }
- if (bpl->bpl_dbuf) {
- dmu_buf_rele(bpl->bpl_dbuf, bpl);
- bpl->bpl_dbuf = NULL;
- bpl->bpl_phys = NULL;
- }
-
- mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
- boolean_t rv;
-
- if (bpl->bpl_object == 0)
- return (B_TRUE);
-
- mutex_enter(&bpl->bpl_lock);
- VERIFY(0 == bplist_hold(bpl)); /* XXX */
- rv = (bpl->bpl_phys->bpl_entries == 0);
- mutex_exit(&bpl->bpl_lock);
-
- return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
- int err = 0;
-
- if (bpl->bpl_cached_dbuf == NULL ||
- bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
- if (bpl->bpl_cached_dbuf != NULL)
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- err = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blkid << bpl->bpl_blockshift,
- bpl, &bpl->bpl_cached_dbuf);
- ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
- 1ULL << bpl->bpl_blockshift);
- }
- return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
-
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
+bplist_destroy(bplist_t *bpl)
{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err)
- return (err);
-
- blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
- off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
- bparray = bpl->bpl_cached_dbuf->db_data;
- bparray[off] = *bp;
-
- /* We never need the fill count. */
- bparray[off].blk_fill = 0;
-
- /* The bplist will compress better if we can leave off the checksum */
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- bpl->bpl_phys->bpl_entries++;
- bpl->bpl_phys->bpl_bytes +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
- bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpl->bpl_lock);
-
- return (0);
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
}
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
{
- bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
- ASSERT(!BP_IS_HOLE(bp));
mutex_enter(&bpl->bpl_lock);
- bpq->bpq_blk = *bp;
- bpq->bpq_next = bpl->bpl_queue;
- bpl->bpl_queue = bpq;
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
}
void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
{
- bplist_q_t *bpq;
+ bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpq = bpl->bpl_queue) != NULL) {
- bpl->bpl_queue = bpq->bpq_next;
+	while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
+ list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
- VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
- kmem_free(bpq, sizeof (*bpq));
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
mutex_enter(&bpl->bpl_lock);
}
mutex_exit(&bpl->bpl_lock);
}
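
A hedged usage sketch of the slimmed-down bplist above (hypothetical caller and callback; the callback's int return type is an assumption inferred from how func is invoked above): the bplist is now a purely in-memory, mutex-protected queue of block pointers that producers append to and a consumer later drains through a callback, one entry per call.

/* ARGSUSED */
static int
example_count_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(*(uint64_t *)arg)++;
	return (0);
}

static void
example_bplist_usage(const blkptr_t *bp, dmu_tx_t *tx)
{
	bplist_t bpl;
	uint64_t count = 0;

	bplist_create(&bpl);
	bplist_append(&bpl, bp);		/* queue a block pointer */
	bplist_iterate(&bpl, example_count_cb, &count, tx);
	bplist_destroy(&bpl);			/* list must be drained first */
}
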
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
- mutex_enter(&bpl->bpl_lock);
- ASSERT3P(bpl->bpl_queue, ==, NULL);
- VERIFY(0 == bplist_hold(bpl));
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- VERIFY(0 == dmu_free_range(bpl->bpl_mos,
- bpl->bpl_object, 0, -1ULL, tx));
- bpl->bpl_phys->bpl_entries = 0;
- bpl->bpl_phys->bpl_bytes = 0;
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp = 0;
- bpl->bpl_phys->bpl_uncomp = 0;
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- *usedp = bpl->bpl_phys->bpl_bytes;
- if (bpl->bpl_havecomp) {
- *compp = bpl->bpl_phys->bpl_comp;
- *uncompp = bpl->bpl_phys->bpl_uncomp;
- }
- mutex_exit(&bpl->bpl_lock);
-
- if (!bpl->bpl_havecomp) {
- uint64_t itor = 0, comp = 0, uncomp = 0;
- blkptr_t bp;
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- comp += BP_GET_PSIZE(&bp);
- uncomp += BP_GET_UCSIZE(&bp);
- }
- if (err == ENOENT)
- err = 0;
- *compp = comp;
- *uncompp = uncomp;
- }
-
- return (err);
-}
-
-/*
- * Return (in *dasizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *dasizep)
-{
- uint64_t size = 0;
- uint64_t itor = 0;
- blkptr_t bp;
- int err;
-
- /*
- * As an optimization, if they want the whole txg range, just
- * get bpl_bytes rather than iterating over the bps.
- */
- if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err == 0)
- *dasizep = bpl->bpl_phys->bpl_bytes;
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
- size +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
- }
- }
- if (err == ENOENT)
- err = 0;
- *dasizep = size;
- return (err);
-}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
new file mode 100644
index 000000000000..72be31235607
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
@@ -0,0 +1,495 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+ boolean_t free)
+{
+ dmu_object_info_t doi;
+ int epb;
+ int64_t i;
+ int err = 0;
+ dmu_buf_t *dbuf = NULL;
+
+ mutex_enter(&bpo->bpo_lock);
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ blkptr_t *bparray;
+ blkptr_t *bp;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (blkptr_t);
+ blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+ FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ bparray = dbuf->db_data;
+ bp = &bparray[blkoff];
+ err = func(arg, bp, tx);
+ if (err)
+ break;
+ if (free) {
+ bpo->bpo_phys->bpo_bytes -=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ i++;
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+ i * sizeof (blkptr_t), -1ULL, tx));
+ }
+ if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ ASSERT(bpo->bpo_havecomp);
+ err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+ if (err) {
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+ }
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+ bpobj_t sublist;
+ uint64_t used_before, comp_before, uncomp_before;
+ uint64_t used_after, comp_after, uncomp_after;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+ if (err)
+ break;
+ if (free) {
+ err = bpobj_space(&sublist,
+ &used_before, &comp_before, &uncomp_before);
+ if (err)
+ break;
+ }
+ err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+ if (free) {
+ VERIFY3U(0, ==, bpobj_space(&sublist,
+ &used_after, &comp_after, &uncomp_after));
+ bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+ bpo->bpo_phys->bpo_uncomp -=
+ uncomp_before - uncomp_after;
+ }
+
+ bpobj_close(&sublist);
+ if (err)
+ break;
+ if (free) {
+ err = dmu_object_free(bpo->bpo_os,
+ objarray[blkoff], tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_num_subobjs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ (i + 1) * sizeof (uint64_t), -1ULL, tx));
+ }
+
+out:
+ /* If there are no entries, there should be no bytes. */
+ ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+ bpo->bpo_phys->bpo_bytes == 0);
+
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (used == 0) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces
+ * recursion in bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset == doi.doi_data_block_size) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+ subsubobjs, tx));
+ }
+ }
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+
+ bpobj_close(&subbpo);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /* We never need the fill count. */
+ stored_bp.blk_fill = 0;
+
+ /* The bpobj will compress better if we can leave off the checksum */
+ if (!BP_GET_DEDUP(bp))
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ bpo->bpo_phys->bpo_bytes +=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
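
A hedged usage sketch of the new on-disk bpobj (hypothetical helpers; signatures are taken from the file above, and error handling is elided with VERIFY as the file itself does): a typical lifecycle is allocate, open, enqueue block pointers, query space, iterate (which also removes the entries), then close and free.

/* ARGSUSED */
static int
example_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	return (0);	/* returning nonzero stops the iteration */
}

static void
example_bpobj_usage(objset_t *os, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t bpo;
	uint64_t obj, used, comp, uncomp;

	obj = bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
	bpobj_enqueue(&bpo, bp, tx);			/* record a freed bp */
	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
	VERIFY3U(0, ==, bpobj_iterate(&bpo, example_bpobj_cb, NULL, tx));
	bpobj_close(&bpo);
	bpobj_free(os, obj, tx);			/* destroy the object */
}
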
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index cf983e234df5..f6b2d99d3285 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -34,12 +33,12 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
/*
* Global data structures and functions for the dbuf cache.
@@ -107,7 +106,7 @@ dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
uint64_t obj = dn->dn_object;
uint64_t hv = DBUF_HASH(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
@@ -138,7 +137,7 @@ static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = db->db_objset;
+ objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
uint64_t blkid = db->db_blkid;
@@ -218,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db)
db->db_evict_func = NULL;
}
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
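
dbuf_is_metadata() above is the first function in this hunk to use the dnode-handle discipline this change rolls out across dbuf.c. A hedged sketch of the pattern (hypothetical helper; DB_DNODE_ENTER/DB_DNODE/DB_DNODE_EXIT are the macros the change switches to): the dnode is pinned for the duration of the access rather than reached through a raw db->db_dnode pointer, so it cannot be relocated while in use.

static uint64_t
example_dbuf_object(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	uint64_t obj;

	DB_DNODE_ENTER(db);	/* pin the dnode behind this dbuf */
	dn = DB_DNODE(db);
	obj = dn->dn_object;
	DB_DNODE_EXIT(db);	/* dn must not be dereferenced after this */
	return (obj);
}
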
void
dbuf_evict(dmu_buf_impl_t *db)
{
@@ -282,7 +297,8 @@ dbuf_fini(void)
static void
dbuf_verify(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db)
return;
ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if (dn == NULL) {
ASSERT(db->db_parent == NULL);
ASSERT(db->db_blkptr == NULL);
@@ -297,24 +315,35 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- list_head(&dn->dn_dbufs));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !list_is_empty(&dn->dn_dbufs));
}
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, 0);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
+ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
/*
* We can't assert that db_size matches dn_datablksz because it
* can be momentarily different when another thread is doing
* dnode_set_blksz().
*/
if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
+ dr = db->db_data_pending;
/*
* It should only be modified in syncing context, so
* make sure we only have one copy of the data.
@@ -331,8 +360,9 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
} else {
/* db is pointed to by an indirect block */
int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
@@ -344,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
* have the struct_rwlock. XXX indblksz no longer
* grows. safe to do this now?
*/
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
@@ -352,7 +382,8 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && !dn->dn_free_txg) {
/*
* If the blkptr isn't set but they have nonzero data,
@@ -368,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -396,8 +428,35 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
} else {
dbuf_evict_user(db);
db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa;
+
+ mutex_exit(&db->db_mtx);
+ DB_GET_SPA(&spa, db);
+ abuf = arc_loan_buf(spa, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ dbuf_set_data(db, NULL);
+ mutex_exit(&db->db_mtx);
}
+ return (abuf);
}
uint64_t
@@ -436,24 +495,26 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, NULL);
+ dbuf_rele_and_unlock(db, NULL);
}
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
arc_buf_t *pbuf;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -461,7 +522,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -471,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -489,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
+ DB_DNODE_EXIT(db);
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -496,17 +559,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
+ spa = dn->dn_objset->os_spa;
+ DB_DNODE_EXIT(db);
+
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_L2CACHE;
- zb.zb_objset = db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
+ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
@@ -516,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
else
pbuf = db->db_objset->os_phys_buf;
- (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+ (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -530,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
+ dnode_t *dn;
/*
* We don't have to hold the mutex to check db_state because it
@@ -537,46 +602,54 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
ASSERT(!refcount_is_zero(&db->db_holds));
+ if (db->db_state == DB_NOFILL)
+ return (EIO);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
- prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
dbuf_read_impl(db, zio, &flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
if (!havepzio)
err = zio_wait(zio);
} else {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -600,18 +673,21 @@ static void
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
+ DB_GET_SPA(&spa, db);
+ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_set_data(db, NULL);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
@@ -643,18 +719,18 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (dr == NULL ||
(dr->dt.dl.dr_data !=
- ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
return;
/*
* If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
- * reset the reference to point to a new copy,
+ * reset the reference to point to a new copy,
	 *	or (if there are no active holders)
* just null out the current db_data pointer.
*/
ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -662,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -674,22 +752,25 @@ void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
ASSERT(db->db_level == 0);
- if (db->db_blkid == DB_BONUS_BLKID ||
+ if (db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
return;
+ ASSERT(db->db_data_pending != dr);
+
/* free this block */
- if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
- /* XXX can get silent EIO here */
- (void) dsl_free(NULL,
- spa_get_dsl(db->db_dnode->dn_objset->os_spa),
- txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
+ if (!BP_IS_HOLE(bp)) {
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
@@ -719,7 +800,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
uint64_t first_l1 = start >> epbs;
uint64_t last_l1 = end >> epbs;
- if (end > dn->dn_maxblkid) {
+ if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
end = dn->dn_maxblkid;
last_l1 = end >> epbs;
}
@@ -727,7 +808,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level == 1 &&
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
@@ -755,6 +836,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
@@ -782,7 +864,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
* size to reflect that this buffer may
* contain new data when we sync.
*/
- if (db->db_blkid > dn->dn_maxblkid)
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
dn->dn_maxblkid = db->db_blkid;
dbuf_unoverride(dr);
} else {
@@ -825,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
- /* If we don't exist or are in a snapshot, we can't be freed */
+ /*
+ * If we don't exist or are in a snapshot, we can't be freed.
+ * Don't pass the bp to dsl_dataset_block_freeable() since we
+ * are holding the db_mtx lock and might deadlock if we are
+ * prefetching a dedup-ed block.
+ */
if (birth_txg)
return (ds == NULL ||
- dsl_dataset_block_freeable(ds, birth_txg));
+ dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (FALSE);
}
@@ -839,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -858,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -878,14 +970,36 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);
- dnode_willuse_space(db->db_dnode, size-osize, tx);
+ dnode_willuse_space(dn, size-osize, tx);
+ DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os;
+ zbookmark_t zb;
+
+ DB_GET_OBJSET(&os, db);
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ zb.zb_objset = os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_release_bp(db->db_buf, db,
+ db->db_blkptr, os->os_spa, &zb);
}
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
@@ -895,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
DMU_TX_DIRTY_BUF(tx, db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
@@ -920,7 +1036,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* syncing context don't bother holding ahead.
*/
ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL);
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
mutex_enter(&dn->dn_mtx);
/*
@@ -936,6 +1053,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
mutex_exit(&dn->dn_mtx);
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
/*
* If this buffer is already dirty, we're done.
*/
@@ -945,13 +1065,16 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -979,18 +1102,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* we already dirtied it in open context. Hence we must make
* this assertion only if we're not already dirty.
*/
+ os = dn->dn_objset;
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* Update the accounting.
* Note: we delay "free accounting" until after we drop
* the db_mtx. This keeps us from grabbing other locks
- * (and possibly deadlocking) in bp_get_dasize() while
+ * (and possibly deadlocking) in bp_get_dsize() while
* also holding the db_mtx.
*/
dnode_willuse_space(dn, db->db.db_size, tx);
@@ -1006,22 +1130,26 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_level == 0) {
void *data_old = db->db_buf;
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect
- * blocks and private objects are not released until the
- * syncing state (since they are only modified then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
}
- ASSERT(data_old != NULL);
dr->dt.dl.dr_data = data_old;
} else {
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -1039,7 +1167,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* and dbuf_dirty. We win, as though the dbuf_noread() had
* happened after the free.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
@@ -1055,17 +1184,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
} else if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
- bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ bp_get_dsize(os->os_spa, bp) : db->db.db_size;
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
@@ -1074,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* db_blkptr, but since this is just a guess,
* it's OK if we get an odd answer.
*/
+ ddt_prefetch(os->os_spa, bp);
dnode_willuse_space(dn, -willfree, tx);
}
@@ -1097,6 +1229,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
parent = dbuf_hold_level(dn, db->db_level+1,
db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
parent_held = TRUE;
}
if (drop_struct_lock)
@@ -1121,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_level+1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1132,21 +1264,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
}
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
uint64_t txg = tx->tx_txg;
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
-
/*
* If this buffer is not dirty, we're done.
*/
@@ -1158,6 +1290,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (0);
}
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* If this buffer is currently held, we cannot undirty
@@ -1171,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1192,14 +1329,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
+ DB_DNODE_EXIT(db);
if (db->db_level == 0) {
- dbuf_unoverride(dr);
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ }
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -1214,7 +1355,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
arc_buf_t *buf = db->db_buf;
- ASSERT(arc_released(buf));
+ ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
dbuf_set_data(db, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
dbuf_evict(db);
@@ -1234,18 +1375,30 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1267,7 +1420,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
@@ -1287,8 +1440,7 @@ void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
ASSERT(buf != NULL);
@@ -1311,9 +1463,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ xuio_stat_wbuf_copied();
return;
}
+ xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
@@ -1349,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
* in this case. For callers from the DMU we will usually see:
* dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
* For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
* Sometimes, though, we will get a mix of these two:
* DMU: dbuf_clear()->arc_buf_evict()
* ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1357,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
void
dbuf_clear(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ dmu_buf_impl_t *dndb;
int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1369,7 +1523,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
@@ -1377,16 +1531,32 @@ dbuf_clear(dmu_buf_impl_t *db)
db->db_state = DB_UNCACHED;
}
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
- if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
+ } else {
+ DB_DNODE_EXIT(db);
}
if (db->db_buf)
@@ -1396,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
/*
- * If this dbuf is referened from an indirect dbuf,
+ * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
if (parent && parent != dndb)
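As an illustrative aside (not part of the patch): the dn_dbufs_count decrement plus membar_producer() above is a publish-before-release ordering, so that dnode_move() observing the dropped hold also observes the reduced count. A bare sketch of that ordering, with a hypothetical wrapper name:

	/*
	 * Hypothetical sketch: publish the decremented count with a store
	 * barrier before dropping the dnode hold, mirroring dbuf_clear().
	 */
	static void
	release_tracked_dbuf(dnode_t *dn, dmu_buf_impl_t *db)
	{
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();	/* make the decrement visible first */
		dnode_rele(dn, db);	/* only then drop the hold */
	}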
@@ -1412,7 +1582,20 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
*bpp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = &dn->dn_phys->dn_spill;
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
if (dn->dn_phys->dn_nlevels == 0)
nlevels = 1;
@@ -1461,7 +1644,7 @@ static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1475,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkid = blkid;
db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
- db->db_dnode = dn;
+ db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
@@ -1485,16 +1668,20 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_immediate_evict = 0;
db->db_freed_in_flight = 0;
- if (blkid == DB_BONUS_BLKID) {
+ if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- db->db.db_offset = DB_BONUS_BLKID;
+ db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1528,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
@@ -1562,20 +1750,29 @@ dbuf_destroy(dmu_buf_impl_t *db)
{
ASSERT(refcount_is_zero(&db->db_holds));
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (db->db_dnode) {
- dnode_t *dn = db->db_dnode;
+ if (db->db_dnode_handle != NULL) {
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
-
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold
+ * corresponding to the removed dbuf is no longer
+ * discounted in dnode_move(), so the dnode cannot be
+ * moved until after we release the hold.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
}
dbuf_hash_remove(db);
}
@@ -1598,7 +1795,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (dnode_block_freed(dn, blkid))
@@ -1606,37 +1803,34 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
/* dbuf_find() returns with db_mtx held */
if (db = dbuf_find(dn, 0, blkid)) {
- if (refcount_count(&db->db_holds) > 0) {
- /*
- * This dbuf is active. We assume that it is
- * already CACHED, or else about to be either
- * read or filled.
- */
- mutex_exit(&db->db_mtx);
- return;
- }
+ /*
+ * This dbuf is already in the cache. We assume that
+ * it is already CACHED, or else about to be either
+ * read or filled.
+ */
mutex_exit(&db->db_mtx);
- db = NULL;
+ return;
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
+ ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
arc_buf_t *pbuf;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
- dn->dn_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = dn->dn_object;
- zb.zb_level = 0;
- zb.zb_blkid = blkid;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, 0, blkid);
if (db)
pbuf = db->db_buf;
else
pbuf = dn->dn_objset->os_phys_buf;
- (void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ (void) dsl_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, priority,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -1655,7 +1849,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
{
dmu_buf_impl_t *db, *parent = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
@@ -1704,7 +1898,7 @@ top:
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
@@ -1713,7 +1907,7 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
@@ -1729,7 +1923,7 @@ top:
if (parent)
dbuf_rele(parent, NULL);
- ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;
@@ -1759,7 +1953,38 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (ENOTSUP);
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ if (blksz > SPA_MAXBLOCKSIZE)
+ blksz = SPA_MAXBLOCKSIZE;
+ else
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dbuf_new_size(db, blksz, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ return (0);
+}
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
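For illustration (a standalone sketch, not part of the patch): dbuf_spill_set_blksz() above normalizes the requested spill block size by clamping it to [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] and rounding up to a SPA_MINBLOCKSIZE multiple. Assuming the usual SPA_MINBLOCKSIZE of 512 bytes, 0 maps to 512 and 700 maps to 1024; the equivalent arithmetic in isolation:

	/* Hypothetical helper showing only the size normalization. */
	static uint64_t
	spill_blksz_normalize(uint64_t blksz)
	{
		if (blksz == 0)
			return (SPA_MINBLOCKSIZE);
		if (blksz > SPA_MAXBLOCKSIZE)
			return (SPA_MAXBLOCKSIZE);
		return (P2ROUNDUP(blksz, SPA_MINBLOCKSIZE));
	}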
@@ -1770,15 +1995,38 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
ASSERT(holds > 1);
}
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
int64_t holds;
- mutex_enter(&db->db_mtx);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
@@ -1794,15 +2042,29 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_evict_user(db);
if (holds == 0) {
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
+
+ /*
+ * If the dnode moves here, we cannot cross this barrier
+ * until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+ DB_DNODE_EXIT(db);
+ /*
+ * The bonus buffer's dnode hold is no longer discounted
+ * in dnode_move(). The dnode cannot move until after
+ * the dnode_rele().
+ */
+ dnode_rele(DB_DNODE(db), db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
* dbuf with any data allocated from the ARC.
*/
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
dbuf_evict(db);
} else if (arc_released(db->db_buf)) {
arc_buf_t *buf = db->db_buf;
@@ -1892,7 +2154,7 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
if (db->db_blkptr)
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
- db->db_blkptr->blk_birth);
+ db->db_blkptr, db->db_blkptr->blk_birth);
return (res);
}
@@ -1906,6 +2168,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (db->db_blkptr != NULL)
return;
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = &dn->dn_phys->dn_spill;
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
/*
* This buffer was allocated at a time when there was
@@ -1941,7 +2208,7 @@ static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
zio_t *zio;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1959,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
}
ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT(db->db_buf != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
db->db_data_pending = dr;
@@ -1982,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2002,23 +2272,34 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
DBUF_VERIFY(db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
/*
* If this is a bonus buffer, simply copy the bonus data into the
* dnode. It will be written out when the dnode is synced (and it
* will be synced, since it must have been dirty for dbuf_sync to
* be called).
*/
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ DB_DNODE_EXIT(db);
+
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2028,6 +2309,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
while (*drp != dr)
drp = &(*drp)->dr_next;
ASSERT(dr->dr_next == NULL);
+ ASSERT(dr->dr_dbuf == db);
*drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
@@ -2036,11 +2318,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
return;
}
+ os = dn->dn_objset;
+
/*
* This function may have dropped the db_mtx lock allowing a dmu_sync
* operation to sneak in. As a result, we need to ensure that we
@@ -2050,7 +2333,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immdiate write,
+ * If this buffer is in the middle of an immediate write,
* wait for the synchronous IO to complete.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2059,43 +2342,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- /*
- * If this dbuf has already been written out via an immediate write,
- * just complete the write by copying over the new block pointer and
- * updating the accounting via the write-completion functions.
- */
- if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- zio_t zio_fake;
-
- zio_fake.io_private = &db;
- zio_fake.io_error = 0;
- zio_fake.io_bp = db->db_blkptr;
- zio_fake.io_bp_orig = *db->db_blkptr;
- zio_fake.io_txg = txg;
- zio_fake.io_flags = 0;
-
- *db->db_blkptr = dr->dt.dl.dr_overridden_by;
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- db->db_data_pending = dr;
- dr->dr_zio = &zio_fake;
- mutex_exit(&db->db_mtx);
-
- ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
- BP_IDENTITY(&zio_fake.io_bp_orig)) ||
- BP_IS_HOLE(zio_fake.io_bp));
-
- if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
- dbuf_write_ready(&zio_fake, db->db_buf, db);
- dbuf_write_done(&zio_fake, db->db_buf, db);
-
- return;
- }
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
*datap == db->db_buf) {
/*
* If this buffer is currently "in use" (i.e., there
@@ -2113,8 +2363,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
-
- ASSERT(*datap != NULL);
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
@@ -2122,10 +2370,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
+ }
}
void
@@ -2154,111 +2412,53 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
}
}
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
-
- if (!BP_IS_HOLE(db->db_blkptr) &&
- (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(data, db);
- } else {
- ASSERT(arc_released(data));
- /* XXX why do we need to thaw here? */
- arc_buf_thaw(data);
- }
-
- if (parent != dn->dn_dbuf) {
- ASSERT(parent && parent->db_data_pending);
- ASSERT(db->db_level == parent->db_level-1);
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- wp.wp_type = dn->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dncompress = dn->dn_compress;
- wp.wp_oscompress = os->os_compress;
- wp.wp_dnchecksum = dn->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
-
- if (BP_IS_OLDER(db->db_blkptr, txg))
- (void) dsl_dataset_block_kill(
- os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
- dr->dr_zio = arc_write(zio, os->os_spa, &wp,
- DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
- data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-}
-
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
+ dnode_t *dn;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
uint64_t fill = 0;
- int old_size, new_size, i;
+ int i;
ASSERT(db->db_blkptr == bp);
- dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, bp);
-
- dnode_diduse_space(dn, new_size - old_size);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- ASSERT3U(bp->blk_fill, ==, 0);
+ ASSERT(bp->blk_fill == 0);
+ DB_DNODE_EXIT(db);
return;
}
- ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
mutex_enter(&db->db_mtx);
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ }
+#endif
+
if (db->db_level == 0) {
mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID)
dn->dn_phys->dn_maxblkid = db->db_blkid;
mutex_exit(&dn->dn_mtx);
@@ -2281,21 +2481,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill += ibp->blk_fill;
}
}
+ DB_DNODE_EXIT(db);
bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
-
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- dsl_dataset_block_born(ds, bp, tx);
- }
}
/* ARGSUSED */
@@ -2303,34 +2493,70 @@ static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t txg = zio->io_txg;
dbuf_dirty_record_t **drp, *dr;
ASSERT3U(zio->io_error, ==, 0);
+ ASSERT(db->db_blkptr == bp);
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+
+ DB_GET_OBJSET(&os, db);
+ ds = os->os_dsl_dataset;
+ tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
drp = &db->db_last_dirty;
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
+ }
+#endif
+
if (db->db_level == 0) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ else if (!arc_released(db->db_buf))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
} else {
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2342,6 +2568,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
+ DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
@@ -2351,9 +2578,134 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
mutex_exit(&db->db_mtx);
- dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+ dbuf_write_done(zio, NULL, db);
+}
- dbuf_rele(db, (void *)(uintptr_t)txg);
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ int wp_flag = 0;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ ASSERT(db->db_state != DB_NOFILL);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ dbuf_write_override_ready, dbuf_write_override_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, NULL, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+ dr->dr_zio = arc_write(zio, os->os_spa, txg,
+ db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
new file mode 100644
index 000000000000..0edf62e89c05
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -0,0 +1,1152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
+TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch);
+SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch,
+ 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+
+ /*
+ * Seed the cached statistics.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class]));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) sprintf(name, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
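For reference (an illustrative aside): assuming DMU_POOL_DDT is the usual "DDT-%s-%s-%s" format, the name built above comes out as, for example, "DDT-sha256-zap-duplicate" for the duplicate-class ZAP object of the sha256 table, i.e. checksum name, table implementation name, then class name.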
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
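A brief aside on the expression above (illustrative, not part of the patch): (*s ^ neg) - neg is a conditional negate. With neg == 0 it is the identity; with neg == -1ULL it is ~x + 1, the two's-complement negation, so the same loop body either adds or subtracts the source stats. For example, with x = 5: (5 ^ 0) - 0 == 5, while (5 ^ ~0ULL) - ~0ULL == (uint64_t)-5.

	/* Hypothetical helper isolating the trick used by ddt_stat_add(). */
	static uint64_t
	cond_negate(uint64_t x, uint64_t neg)
	{
		return ((x ^ neg) - neg);	/* x if neg == 0, -x if neg == -1ULL */
	}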
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
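Worked example (editorial aside): the pool dedup ratio is the referenced allocated size over the actual allocated size, scaled by 100. If deduplicated blocks are referenced for a total of 300 MB (dds_ref_dsize) but occupy only 100 MB on disk (dds_dsize), the function returns 300 * 100 / 100 = 300, which userland reports as a 3.00x dedup ratio.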
+
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+}
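Worked example (editorial aside): the desired copy count steps up at total_refcnt >= 1, >= ditto, and >= ditto * ditto. With spa_dedup_ditto = 100, an entry with 150 total references wants 2 copies and one with 10000 or more wants 3; if only 1 copy is currently present (total_copies == 1), the function returns 2 - 1 = 1 extra copy for the caller to allocate.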
+
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+}
+
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+ return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ byteswap_uint64_array(dst, d_len);
+}
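For illustration (a sketch assuming the same buffer sizing as ddt_zap_lookup(), not code from this change): ddt_compress() stores a one-byte header carrying the host byte order flag and the compression function (ZLE, falling back to OFF when the data does not shrink), and ddt_decompress() consumes that header before inflating. A hypothetical round trip:

	/* Hypothetical round-trip check over a DDT entry's phys array. */
	static void
	ddt_phys_roundtrip(ddt_entry_t *dde)
	{
		uchar_t cbuf[sizeof (dde->dde_phys) + 1];	/* +1 for version byte */
		ddt_phys_t copy[DDT_PHYS_TYPES];
		size_t csize;

		csize = ddt_compress(dde->dde_phys, cbuf,
		    sizeof (dde->dde_phys), sizeof (cbuf));
		ddt_decompress(cbuf, copy, csize, sizeof (copy));
		ASSERT(bcmp(dde->dde_phys, copy, sizeof (copy)) == 0);
	}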
+
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+ return (spa->spa_ddt[c]);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_data != NULL)
+ zio_buf_free(dde->dde_repair_data,
+ DDK_GET_PSIZE(&dde->dde_key));
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT)
+ break;
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ASSERT(error == 0 || error == ENOENT);
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
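Usage sketch (editorial aside, hypothetical caller with spa and bp assumed in scope): lookups and any use of the returned in-core entry happen under the per-table lock taken via ddt_enter()/ddt_exit(); ddt_lookup() itself drops and retakes that lock while it faults the entry in from the on-disk objects.

	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);	/* add to the AVL tree if missing */
	/* ... examine or update dde->dde_phys while the lock is held ... */
	ddt_exit(ddt);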
+
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+ for (int i = 0; i < DDT_KEY_WORDS; i++) {
+ if (u1[i] < u2[i])
+ return (-1);
+ if (u1[i] > u2[i])
+ return (1);
+ }
+
+ return (0);
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object, tx) == 0);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ count += ddt_object_count(ddt, type, class);
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ zio_t *rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ dmu_tx_commit(tx);
+}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (ENOENT);
+}
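
Editor's note (illustrative sketch, not part of this commit): ddt_walk() above resumes from a caller-owned ddt_bookmark_t, cycling class-by-class, type-by-type, and checksum-by-checksum until every table is exhausted. A hedged example of driving it to completion follows; the function name is hypothetical, modeled on how the scan code consumes this interface.

static void
example_ddt_walk_all(spa_t *spa)
{
	ddt_bookmark_t ddb = { 0 };	/* zeroed bookmark == start of walk */
	ddt_entry_t dde;

	/*
	 * Each call fills in one dedup-table entry and advances the bookmark;
	 * ENOENT means every checksum/type/class combination has been visited.
	 * The bookmark may be saved and the walk resumed in a later txg.
	 */
	while (ddt_walk(spa, &ddb, &dde) == 0) {
		/* dde.dde_key and dde.dde_phys[] describe one dedup'd block. */
	}
}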
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
new file mode 100644
index 000000000000..6812aa34cfb7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
@@ -0,0 +1,156 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t one, csize;
+ int error;
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ return (error);
+
+ ASSERT(one == 1);
+ ASSERT(csize <= sizeof (cbuf));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ return (error);
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+ return (0);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+ uint64_t count = 0;
+
+ VERIFY(zap_count(os, object, &count) == 0);
+
+ return (count);
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
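
Editor's note (illustrative sketch, not part of this commit): ddt_zap_ops is the only ddt_ops_t implementation registered, so the ddt_object_*() wrappers in ddt.c reach these functions through an ops table, keeping the on-disk container format pluggable. A hedged sketch of that dispatch; the wrapper name is hypothetical and the member/field names are assumed to match their declarations in sys/ddt.h.

static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,			/* DDT_TYPE_ZAP */
};

static uint64_t
example_ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	uint64_t object = ddt->ddt_object[type][class];

	/* The wrapper simply forwards to the container-specific routine. */
	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, object));
}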
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 26b4e5f5a855..56e284a6d610 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -40,7 +39,10 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/sa.h>
+#ifdef _KERNEL
#include <sys/zfs_znode.h>
+#endif
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -48,8 +50,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint64_array, TRUE, "object array" },
{ byteswap_uint8_array, TRUE, "packed nvlist" },
{ byteswap_uint64_array, TRUE, "packed nvlist size" },
- { byteswap_uint64_array, TRUE, "bplist" },
- { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "bpobj" },
+ { byteswap_uint64_array, TRUE, "bpobj header" },
{ byteswap_uint64_array, TRUE, "SPA space map header" },
{ byteswap_uint64_array, TRUE, "SPA space map" },
{ byteswap_uint64_array, TRUE, "ZIL intent log" },
@@ -81,21 +83,38 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "FUID table" },
{ byteswap_uint64_array, TRUE, "FUID table size" },
{ zap_byteswap, TRUE, "DSL dataset next clones"},
- { zap_byteswap, TRUE, "scrub work queue" },
+ { zap_byteswap, TRUE, "scan work queue" },
{ zap_byteswap, TRUE, "ZFS user/group used" },
{ zap_byteswap, TRUE, "ZFS user/group quota" },
+ { zap_byteswap, TRUE, "snapshot refcount tags"},
+ { zap_byteswap, TRUE, "DDT ZAP algorithm" },
+ { zap_byteswap, TRUE, "DDT statistics" },
+ { byteswap_uint8_array, TRUE, "System attributes" },
+ { zap_byteswap, TRUE, "SA master node" },
+ { zap_byteswap, TRUE, "SA attr registration" },
+ { zap_byteswap, TRUE, "SA attr layouts" },
+ { zap_byteswap, TRUE, "scan translations" },
+ { byteswap_uint8_array, FALSE, "deduplicated block" },
+ { zap_byteswap, TRUE, "DSL deadlist map" },
+ { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" },
+ { zap_byteswap, TRUE, "DSL dir clones" },
+ { byteswap_uint64_array, TRUE, "bpobj subobj" },
};
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
+ void *tag, dmu_buf_t **dbp, int flags)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
blkid = dbuf_whichblock(dn, offset);
@@ -105,7 +124,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
if (db == NULL) {
err = EIO;
} else {
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ err = dbuf_read(db, NULL, db_flags);
if (err) {
dbuf_rele(db, tag);
db = NULL;
@@ -113,7 +132,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
}
dnode_rele(dn, FTAG);
- *dbp = &db->db;
+ *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -124,16 +143,79 @@ dmu_bonus_max(void)
}
int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
- if (newsize < 0 || newsize > db->db_size)
- return (EINVAL);
- dnode_setbonuslen(dn, newsize, tx);
- return (0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = EINVAL;
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (type > DMU_OT_NUMTYPES) {
+ error = EINVAL;
+ } else if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
}
/*
@@ -146,7 +228,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dmu_buf_impl_t *db;
int error;
- error = dnode_hold(os->os, object, FTAG, &dn);
+ error = dnode_hold(os, object, FTAG, &dn);
if (error)
return (error);
@@ -158,21 +240,105 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
/* as long as the bonus buf is held, the dnode will be held */
- if (refcount_add(&db->db_holds, tag) == 1)
+ if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
*dbp = &db->db;
return (0);
}
/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ ASSERT(db != NULL);
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else
+ dbuf_rele(db, tag);
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = EINVAL;
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = ENOENT;
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
* to take a held dnode rather than <os, object> -- the lookup is wasteful,
* and can induce severe lock contention when writing to several files
@@ -278,7 +444,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -291,14 +457,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
}
int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
int err;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
return (err);
}
@@ -331,7 +501,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
@@ -348,7 +518,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return;
@@ -480,7 +650,7 @@ dmu_free_long_range(objset_t *os, uint64_t object,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
@@ -495,7 +665,7 @@ dmu_free_object(objset_t *os, uint64_t object)
dmu_tx_t *tx;
int err;
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err != 0)
return (err);
@@ -523,7 +693,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
{
dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
+ int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
ASSERT(offset < UINT64_MAX);
@@ -541,7 +711,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_t **dbp;
int numbufs, err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -634,12 +804,157 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+ dmu_xuio_t *priv;
+ uio_t *uio = &xuio->xu_uio;
+
+ uio->uio_iovcnt = nblk;
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv->cnt = nblk;
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->iovp = uio->uio_iov;
+ XUIO_XUZC_PRIV(xuio) = priv;
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+ return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int nblk = priv->cnt;
+
+ kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+ kmem_free(priv, sizeof (dmu_xuio_t));
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+ struct iovec *iov;
+ uio_t *uio = &xuio->xu_uio;
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int i = priv->next++;
+
+ ASSERT(i < priv->cnt);
+ ASSERT(off + n <= arc_buf_size(abuf));
+ iov = uio->uio_iov + i;
+ iov->iov_base = (char *)abuf->b_data + off;
+ iov->iov_len = n;
+ priv->bufs[i] = abuf;
+ return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (xuio_ksp != NULL) {
+ xuio_ksp->ks_data = &xuio_stats;
+ kstat_install(xuio_ksp);
+ }
+}
+
+static void
+xuio_stat_fini(void)
+{
+ if (xuio_ksp != NULL) {
+ kstat_delete(xuio_ksp);
+ xuio_ksp = NULL;
+ }
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
+ xuio_t *xuio = NULL;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -650,6 +965,11 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
if (err)
return (err);
+#ifdef UIO_XUIO
+ if (uio->uio_extflg == UIO_XUIO)
+ xuio = (xuio_t *)uio;
+#endif
+
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
@@ -660,8 +980,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
+ if (xuio) {
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ arc_buf_t *dbuf_abuf = dbi->db_buf;
+ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+ if (!err) {
+ uio->uio_resid -= tocpy;
+ uio->uio_loffset += tocpy;
+ }
+
+ if (abuf == dbuf_abuf)
+ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+ else
+ XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+ } else {
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ }
if (err)
break;
@@ -672,19 +1008,16 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
return (err);
}
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
+static int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
- int numbufs, i;
+ int numbufs;
int err = 0;
+ int i;
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
- FALSE, FTAG, &numbufs, &dbp);
+ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
@@ -722,11 +1055,52 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
size -= tocpy;
}
+
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
-#ifndef __FreeBSD__
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+#ifdef sun
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
page_t *pp, dmu_tx_t *tx)
@@ -781,8 +1155,8 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
-#endif /* !__FreeBSD__ */
-#endif /* _KERNEL */
+#endif /* sun */
+#endif
/*
* Allocate a loaned anonymous arc buffer.
@@ -790,9 +1164,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ spa_t *spa;
- return (arc_loan_buf(dn->dn_objset->os_spa, size));
+ DB_GET_SPA(&spa, db);
+ return (arc_loan_buf(spa, size));
}
/*
@@ -814,78 +1190,147 @@ void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+ dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_size(buf);
uint64_t blkid;
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(dbuf);
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
+ objset_t *os;
+ uint64_t object;
+
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
+ os = dn->dn_objset;
+ object = dn->dn_object;
+ DB_DNODE_EXIT(dbuf);
+
dbuf_rele(db, FTAG);
- ASSERT(dn->dn_objset->os.os == dn->dn_objset);
- dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
- buf->b_data, tx);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
typedef struct {
- dbuf_dirty_record_t *dr;
- dmu_sync_cb_t *done;
- void *arg;
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;
/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
blkptr_t *bp = zio->io_bp;
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- if (!BP_IS_HOLE(bp)) {
- ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
- ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
- } else {
- /*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to replay
- */
- BP_SET_LSIZE(bp, db->db.db_size);
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else {
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
}
}
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dmu_sync_cb_t *done = in->done;
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
- BP_ZERO(&dr->dt.dl.dr_overridden_by);
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- if (done)
- done(&(db->db), in->arg);
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
- kmem_free(in, sizeof (dmu_sync_arg_t));
+ if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+ return (EIO); /* Make zl_get_data do txg_wait_synced() */
+ }
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
}
/*
@@ -904,157 +1349,112 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
- * EINPROGRESS: the IO has been initiated.
- * The caller should log this blkptr in the callback.
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
*
- * 0: completed. Sets *bp to the blkptr just written.
- * The caller should log this blkptr immediately.
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
*/
int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
- blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- objset_impl_t *os = db->db_objset;
- dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
- tx_state_t *tx = &dp->dp_tx;
+ blkptr_t *bp = zgd->zgd_bp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *in;
+ dmu_sync_arg_t *dsa;
zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
- int err;
+ zio_prop_t zp;
+ dnode_t *dn;
+ ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
- dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
- txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
/*
- * XXX - would be nice if we could do this without suspending...
+ * If we're frozen (running ziltest), we always need to generate a bp.
*/
- txg_suspend(dp);
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
/*
- * If this txg already synced, there's nothing to do.
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
*/
- if (txg <= tx->tx_synced_txg) {
- txg_resume(dp);
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
/*
- * If we're running ziltest, we need the blkptr regardless.
+ * This txg has already synced. There's nothing to do.
*/
- if (txg > spa_freeze_txg(dp->dp_spa)) {
- /* if db_blkptr == NULL, this was an empty write */
- if (db->db_blkptr)
- *bp = *db->db_blkptr; /* structure assignment */
- return (0);
- }
+ mutex_exit(&db->db_mtx);
return (EEXIST);
}
- mutex_enter(&db->db_mtx);
-
- if (txg == tx->tx_syncing_txg) {
- while (db->db_data_pending) {
- /*
- * IO is in-progress. Wait for it to finish.
- * XXX - would be nice to be able to somehow "attach"
- * this zio to the parent zio passed in.
- */
- cv_wait(&db->db_changed, &db->db_mtx);
- if (!db->db_data_pending &&
- db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
- /*
- * IO was compressed away
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- ASSERT(db->db_data_pending ||
- (db->db_blkptr && db->db_blkptr->blk_birth == txg));
- }
-
- if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
- /*
- * IO is already completed.
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
}
dr = db->db_last_dirty;
- while (dr && dr->dr_txg > txg)
+ while (dr && dr->dr_txg != txg)
dr = dr->dr_next;
- if (dr == NULL || dr->dr_txg < txg) {
+
+ if (dr == NULL) {
/*
- * This dbuf isn't dirty, must have been free_range'd.
+ * There's no dr for this dbuf, so it must have been freed.
* There's no need to log writes to freed blocks, so we're done.
*/
mutex_exit(&db->db_mtx);
- txg_resume(dp);
return (ENOENT);
}
ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
- * We have already issued a sync write for this buffer.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (EALREADY);
- } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * This buffer has already been synced. It could not
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
* have been dirtied since, or we would have cleared the state.
*/
- *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
+ return (EALREADY);
}
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- in->dr = dr;
- in->done = done;
- in->arg = arg;
mutex_exit(&db->db_mtx);
- txg_resume(dp);
- zb.zb_objset = os->os_dsl_dataset->ds_object;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
- wp.wp_type = db->db_dnode->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dnchecksum = db->db_dnode->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_dncompress = db->db_dnode->dn_compress;
- wp.wp_oscompress = os->os_compress;
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dmu_sync_ready, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
- ASSERT(BP_IS_HOLE(bp));
-
- zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
- txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-
- if (pio) {
- zio_nowait(zio);
- err = EINPROGRESS;
- } else {
- err = zio_wait(zio);
- ASSERT(err == 0);
- }
- return (err);
+ return (0);
}
int
@@ -1064,7 +1464,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dnode_t *dn;
int err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dnode_set_blksz(dn, size, ibs, tx);
@@ -1079,7 +1479,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
+ (void) dnode_hold(os, object, FTAG, &dn);
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
@@ -1093,20 +1493,103 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
+ (void) dnode_hold(os, object, FTAG, &dn);
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
}
+int zfs_mdcomp_disable = 0;
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ int copies = os->os_copies;
+
+ /*
+ * Determine checksum setting.
+ */
+ if (ismd) {
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (zio_checksum_table[checksum].ci_correctable < 1 ||
+ zio_checksum_table[checksum].ci_eck)
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum, checksum);
+ }
+
+ /*
+ * Determine compression setting.
+ */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+ } else {
+ compress = zio_compress_select(dn->dn_compress, compress);
+ }
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(), we won't
+ * actually dedup now because that's all done in syncing context;
+ * but we do want to use the dedup checksum. If the checksum is not
+ * strong enough to ensure unique signatures, force dedup_verify.
+ */
+ dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
+ if (dedup) {
+ checksum = dedup_checksum;
+ if (!zio_checksum_table[checksum].ci_dedup)
+ dedup_verify = 1;
+ }
+
+ if (wp & WP_DMU_SYNC)
+ dedup = 0;
+
+ if (wp & WP_NOFILL) {
+ ASSERT(!ismd && level == 0);
+ checksum = ZIO_CHECKSUM_OFF;
+ compress = ZIO_COMPRESS_OFF;
+ dedup = B_FALSE;
+ }
+
+ zp->zp_checksum = checksum;
+ zp->zp_compress = compress;
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+}
+
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
int i, err;
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
/*
@@ -1120,7 +1603,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
if (i != TXG_SIZE) {
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
- err = dnode_hold(os->os, object, FTAG, &dn);
+ err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
}
@@ -1134,21 +1617,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
+ dnode_phys_t *dnp;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
+ dnp = dn->dn_phys;
+
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
@@ -1162,7 +1651,7 @@ int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
+ int err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
@@ -1178,9 +1667,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
}
/*
@@ -1188,14 +1681,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
* This is specifically optimized for zfs_getattr().
*/
void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
/* add 1 for dnode space */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + 1;
+ DB_DNODE_EXIT(db);
}
void
@@ -1246,8 +1745,12 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
- dbuf_init();
+ zfs_dbgmsg_init();
+ sa_cache_init();
+ xuio_stat_init();
+ dmu_objset_init();
dnode_init();
+ dbuf_init();
zfetch_init();
arc_init();
l2arc_init();
@@ -1256,9 +1759,13 @@ dmu_init(void)
void
dmu_fini(void)
{
+ l2arc_fini();
arc_fini();
zfetch_fini();
- dnode_fini();
dbuf_fini();
- l2arc_fini();
+ dnode_fini();
+ dmu_objset_fini();
+ xuio_stat_fini();
+ sa_cache_fini();
+ zfs_dbgmsg_fini();
}
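
Editor's note (illustrative sketch, not part of this commit): the reworked dmu_sync() contract above moves the blkptr hand-off into the done callback. A hedged sketch of how a ZIL get_data path might react to each return code; the function names are hypothetical and error handling is abbreviated.

/* Matches the new dmu_sync_cb_t shape: called with the zgd and the I/O error. */
static void
example_sync_done(zgd_t *zgd, int error)
{
	/* On success the ZIL logs zgd->zgd_bp; on error the record is retried. */
}

static int
example_get_data(zio_t *pio, uint64_t txg, zgd_t *zgd)
{
	int error = dmu_sync(pio, txg, example_sync_done, zgd);

	switch (error) {
	case 0:		/* write issued; bp is logged from example_sync_done() */
	case EEXIST:	/* txg already synced; read the bp from the dbuf instead */
	case ENOENT:	/* block was freed; nothing to log */
	case EALREADY:	/* an earlier dmu_sync() of this dbuf is still in flight */
		return (error);
	default:	/* EIO: caller falls back to txg_wait_synced() */
		return (error);
	}
}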
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
new file mode 100644
index 000000000000..c72a28ba0cde
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+ struct file *da_fp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+ kthread_t *da_td;
+};
+
+static int
+write_bytes(struct diffarg *da)
+{
+ struct uio auio;
+ struct iovec aiov;
+
+ aiov.iov_base = (caddr_t)&da->da_ddr;
+ aiov.iov_len = sizeof (da->da_ddr);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = da->da_td;
+#ifdef _KERNEL
+ if (da->da_fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ return (EOPNOTSUPP);
+#endif
+}
+
+static int
+write_record(struct diffarg *da)
+{
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ da->da_err = write_bytes(da);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
+
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct diffarg *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (EINTR);
+
+ if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (bp == NULL) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ uint32_t aflags = ARC_WAIT;
+ int blksz = BP_GET_LSIZE(bp);
+ int i;
+
+ if (dsl_read(NULL, spa, bp, pbuf,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ blk = abuf->b_data;
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp, offset_t *offp)
+{
+ struct diffarg da;
+ dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
+ dsl_dataset_t *findds;
+ dsl_dataset_t *relds;
+ int err = 0;
+
+ /* make certain we are looking at snapshots */
+ if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds))
+ return (EINVAL);
+
+ /* fromsnap must be earlier and from the same lineage as tosnap */
+ if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)
+ return (EXDEV);
+
+ relds = NULL;
+ findds = ds;
+
+ while (fromds->ds_dir != findds->ds_dir) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (!dsl_dir_is_clone(findds->ds_dir)) {
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+ return (EXDEV);
+ }
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp,
+ findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds);
+ rw_exit(&dp->dp_config_rwlock);
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ if (err)
+ return (EXDEV);
+
+ relds = findds;
+ }
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ da.da_fp = fp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+ da.da_td = curthread;
+
+ err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+ if (err) {
+ da.da_err = err;
+ } else {
+ /* we set the da.da_err we return as side-effect */
+ (void) write_record(&da);
+ }
+
+ return (da.da_err);
+}
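
Editor's note (illustrative sketch, not part of this commit): dmu_diff() above streams fixed-size dmu_diff_record_t entries to the supplied file, each coalescing a run of consecutive object numbers into a DDR_INUSE range (created or modified since fromsnap) or a DDR_FREE range (freed since fromsnap). A hedged userland-style consumer, assuming the relevant ZFS headers are available; the names below are hypothetical.

static void
example_consume_diff(int fd)
{
	dmu_diff_record_t ddr;

	while (read(fd, &ddr, sizeof (ddr)) == (ssize_t)sizeof (ddr)) {
		if (ddr.ddr_type == DDR_INUSE) {
			/* Objects ddr_first..ddr_last exist in tosnap and changed. */
		} else if (ddr.ddr_type == DDR_FREE) {
			/* Objects ddr_first..ddr_last were freed since fromsnap. */
		}
	}
}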
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 1f91fc1ad36f..8dff46048902 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -32,16 +31,15 @@ uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- objset_impl_t *osi = os->os;
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
int restarted = B_FALSE;
- mutex_enter(&osi->os_obj_lock);
+ mutex_enter(&os->os_obj_lock);
for (;;) {
- object = osi->os_obj_next;
+ object = os->os_obj_next;
/*
* Each time we polish off an L2 bp worth of dnodes
* (2^13 objects), move to another L2 bp that's still
@@ -51,14 +49,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
*/
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(osi->os_meta_dnode,
+ int error = dnode_next_offset(DMU_META_DNODE(os),
DNODE_FIND_HOLE,
&offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
if (error == 0)
object = offset >> DNODE_SHIFT;
}
- osi->os_obj_next = ++object;
+ os->os_obj_next = ++object;
/*
* XXX We should check for an i/o error here and return
@@ -66,19 +64,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
FTAG, &dn);
if (dn)
break;
if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- osi->os_obj_next = object - 1;
+ os->os_obj_next = object - 1;
}
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
- mutex_exit(&osi->os_obj_lock);
+ mutex_exit(&os->os_obj_lock);
dmu_tx_add_new_object(tx, os, object);
return (object);
@@ -94,7 +92,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
if (err)
return (err);
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
@@ -116,7 +114,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (object == DMU_META_DNODE_OBJECT)
return (EBADF);
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err)
return (err);
@@ -128,7 +126,11 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
return (0);
}
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (bonustype == DMU_OT_SA) {
+ nblkptr = 1;
+ } else {
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ }
/*
* If we are losing blkptrs or changing the block size this must
@@ -166,7 +168,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err)
return (err);
@@ -185,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
int error;
- error = dnode_next_offset(os->os->os_meta_dnode,
+ error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 2678b839fda7..09d13db717ee 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
@@ -36,22 +37,41 @@
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
spa_t *
dmu_objset_spa(objset_t *os)
{
- return (os->os->os_spa);
+ return (os->os_spa);
}
zilog_t *
dmu_objset_zil(objset_t *os)
{
- return (os->os->os_zil);
+ return (os->os_zil);
}
dsl_pool_t *
@@ -59,82 +79,112 @@ dmu_objset_pool(objset_t *os)
{
dsl_dataset_t *ds;
- if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
return (ds->ds_dir->dd_pool);
else
- return (spa_get_dsl(os->os->os_spa));
+ return (spa_get_dsl(os->os_spa));
}
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
- return (os->os->os_dsl_dataset);
+ return (os->os_dsl_dataset);
}
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
- return (os->os->os_phys->os_type);
+ return (os->os_phys->os_type);
}
void
dmu_objset_name(objset_t *os, char *buf)
{
- dsl_dataset_name(os->os->os_dsl_dataset, buf);
+ dsl_dataset_name(os->os_dsl_dataset, buf);
}
uint64_t
dmu_objset_id(objset_t *os)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+uint64_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance should have been done by now.
*/
ASSERT(newval != ZIO_CHECKSUM_INHERIT);
- osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}
static void
compression_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval != ZIO_COMPRESS_INHERIT);
- osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+ os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}
static void
copies_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval > 0);
- ASSERT(newval <= spa_max_replication(osi->os_spa));
+ ASSERT(newval <= spa_max_replication(os->os_spa));
- osi->os_copies = newval;
+ os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
@@ -142,13 +192,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval)
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
- osi->os_primary_cache = newval;
+ os->os_primary_cache = newval;
}
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
- objset_impl_t *osi = arg;
+ objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
@@ -156,7 +206,35 @@ secondary_cache_changed_cb(void *arg, uint64_t newval)
ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
newval == ZFS_CACHE_METADATA);
- osi->os_secondary_cache = newval;
+ os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+ newval == ZFS_SYNC_DISABLED);
+
+ os->os_sync = newval;
+ if (os->os_zil)
+ zil_set_sync(os->os_zil, newval);
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+ newval == ZFS_LOGBIAS_THROUGHPUT);
+ os->os_logbias = newval;
+ if (os->os_zil)
+ zil_set_logbias(os->os_zil, newval);
}
void
@@ -177,39 +255,37 @@ dmu_objset_byteswap(void *buf, size_t size)
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- objset_impl_t **osip)
+ objset_t **osp)
{
- objset_impl_t *osi;
+ objset_t *os;
int i, err;
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
- osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
- osi->os.os = osi;
- osi->os_dsl_dataset = ds;
- osi->os_spa = spa;
- osi->os_rootbp = bp;
- if (!BP_IS_HOLE(osi->os_rootbp)) {
+ os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+ os->os_dsl_dataset = ds;
+ os->os_spa = spa;
+ os->os_rootbp = bp;
+ if (!BP_IS_HOLE(os->os_rootbp)) {
uint32_t aflags = ARC_WAIT;
zbookmark_t zb;
- zb.zb_objset = ds ? ds->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
- if (DMU_OS_IS_L2CACHEABLE(osi))
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_L2CACHE;
- dprintf_bp(osi->os_rootbp, "reading %s", "");
+ dprintf_bp(os->os_rootbp, "reading %s", "");
/*
- * NB: when bprewrite scrub can change the bp,
+ * XXX when bprewrite scrub can change the bp,
* and this is called from dmu_objset_open_ds_os, the bp
* could change, and we'll need a lock.
*/
- err = arc_read_nolock(NULL, spa, osi->os_rootbp,
- arc_getbuf_func, &osi->os_phys_buf,
+ err = dsl_read_nolock(NULL, spa, os->os_rootbp,
+ arc_getbuf_func, &os->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
- kmem_free(osi, sizeof (objset_impl_t));
+ kmem_free(os, sizeof (objset_t));
/* convert checksum errors into IO errors */
if (err == ECKSUM)
err = EIO;
@@ -218,27 +294,27 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
- arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+ arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_buf_alloc(spa,
- sizeof (objset_phys_t), &osi->os_phys_buf,
+ sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
- bcopy(osi->os_phys_buf->b_data, buf->b_data,
- arc_buf_size(osi->os_phys_buf));
- (void) arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf);
- osi->os_phys_buf = buf;
+ bcopy(os->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(os->os_phys_buf));
+ (void) arc_buf_remove_ref(os->os_phys_buf,
+ &os->os_phys_buf);
+ os->os_phys_buf = buf;
}
- osi->os_phys = osi->os_phys_buf->b_data;
- osi->os_flags = osi->os_phys->os_flags;
+ os->os_phys = os->os_phys_buf->b_data;
+ os->os_flags = os->os_phys->os_flags;
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- osi->os_phys_buf = arc_buf_alloc(spa, size,
- &osi->os_phys_buf, ARC_BUFC_METADATA);
- osi->os_phys = osi->os_phys_buf->b_data;
- bzero(osi->os_phys, size);
+ os->os_phys_buf = arc_buf_alloc(spa, size,
+ &os->os_phys_buf, ARC_BUFC_METADATA);
+ os->os_phys = os->os_phys_buf->b_data;
+ bzero(os->os_phys, size);
}
/*
@@ -249,61 +325,78 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
*/
if (ds) {
err = dsl_prop_register(ds, "primarycache",
- primary_cache_changed_cb, osi);
+ primary_cache_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "secondarycache",
- secondary_cache_changed_cb, osi);
+ secondary_cache_changed_cb, os);
if (!dsl_dataset_is_snapshot(ds)) {
if (err == 0)
err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
+ checksum_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
+ compression_changed_cb, os);
if (err == 0)
err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
+ copies_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "dedup",
+ dedup_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "logbias",
+ logbias_changed_cb, os);
+ if (err == 0)
+ err = dsl_prop_register(ds, "sync",
+ sync_changed_cb, os);
}
if (err) {
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf) == 1);
- kmem_free(osi, sizeof (objset_impl_t));
+ VERIFY(arc_buf_remove_ref(os->os_phys_buf,
+ &os->os_phys_buf) == 1);
+ kmem_free(os, sizeof (objset_t));
return (err);
}
} else if (ds == NULL) {
/* It's the meta-objset. */
- osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_LZJB;
- osi->os_copies = spa_max_replication(spa);
- osi->os_primary_cache = ZFS_CACHE_ALL;
- osi->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ os->os_compress = ZIO_COMPRESS_LZJB;
+ os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = 0;
+ os->os_logbias = 0;
+ os->os_sync = 0;
+ os->os_primary_cache = ZFS_CACHE_ALL;
+ os->os_secondary_cache = ZFS_CACHE_ALL;
}
- osi->os_zil_header = osi->os_phys->os_zil_header;
- osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
+ if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+ os->os_zil_header = os->os_phys->os_zil_header;
+ os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
- list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[i]));
- list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
offsetof(dnode_t, dn_dirty_link[i]));
}
- list_create(&osi->os_dnodes, sizeof (dnode_t),
+ list_create(&os->os_dnodes, sizeof (dnode_t),
offsetof(dnode_t, dn_link));
- list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
- mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
-
- osi->os_meta_dnode = dnode_special_open(osi,
- &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
- if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
- osi->os_userused_dnode = dnode_special_open(osi,
- &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
- osi->os_groupused_dnode = dnode_special_open(osi,
- &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ DMU_META_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+ &os->os_meta_dnode);
+ if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
+ DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+ &os->os_userused_dnode);
+ DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+ &os->os_groupused_dnode);
}
/*
@@ -311,117 +404,96 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* have ds_opening_lock
*/
if (ds) {
- VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
- dmu_objset_evict));
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
}
- *osip = osi;
+ *osp = os;
return (0);
}
-static int
-dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
- objset_impl_t *osi;
+ int err = 0;
mutex_enter(&ds->ds_opening_lock);
- osi = dsl_dataset_get_user_ptr(ds);
- if (osi == NULL) {
- int err;
-
+ *osp = ds->ds_objset;
+ if (*osp == NULL) {
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, &osi);
- if (err) {
- mutex_exit(&ds->ds_opening_lock);
- return (err);
- }
+ ds, dsl_dataset_get_blkptr(ds), osp);
}
mutex_exit(&ds->ds_opening_lock);
-
- os->os = osi;
- os->os_mode = DS_MODE_NOHOLD;
-
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
- return (EINVAL);
- return (0);
+ return (err);
}
+/* called from zpl */
int
-dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
- objset_t *os;
+ dsl_dataset_t *ds;
int err;
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dmu_objset_open_ds_os(ds, os, type);
+ err = dsl_dataset_hold(name, tag, &ds);
if (err)
- kmem_free(os, sizeof (objset_t));
- else
- *osp = os;
+ return (err);
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err)
+ dsl_dataset_rele(ds, tag);
+
return (err);
}
/* called from zpl */
int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
{
- objset_t *os;
dsl_dataset_t *ds;
int err;
- ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER ||
- DS_MODE_TYPE(mode) == DS_MODE_OWNER);
-
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- if (DS_MODE_TYPE(mode) == DS_MODE_USER)
- err = dsl_dataset_hold(name, os, &ds);
- else
- err = dsl_dataset_own(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
+ err = dsl_dataset_own(name, B_FALSE, tag, &ds);
+ if (err)
return (err);
- }
- err = dmu_objset_open_ds_os(ds, os, type);
+ err = dmu_objset_from_ds(ds, osp);
if (err) {
- if (DS_MODE_TYPE(mode) == DS_MODE_USER)
- dsl_dataset_rele(ds, os);
- else
- dsl_dataset_disown(ds, os);
- kmem_free(os, sizeof (objset_t));
- } else {
- os->os_mode = mode;
- *osp = os;
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dmu_objset_disown(*osp, tag);
+ return (EINVAL);
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dmu_objset_disown(*osp, tag);
+ return (EROFS);
}
return (err);
}
void
-dmu_objset_close(objset_t *os)
+dmu_objset_rele(objset_t *os, void *tag)
{
- ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER ||
- DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER ||
- DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD);
+ dsl_dataset_rele(os->os_dsl_dataset, tag);
+}
- if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER)
- dsl_dataset_rele(os->os->os_dsl_dataset, os);
- else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER)
- dsl_dataset_disown(os->os->os_dsl_dataset, os);
- kmem_free(os, sizeof (objset_t));
+void
+dmu_objset_disown(objset_t *os, void *tag)
+{
+ dsl_dataset_disown(os->os_dsl_dataset, tag);
}
int
dmu_objset_evict_dbufs(objset_t *os)
{
- objset_impl_t *osi = os->os;
dnode_t *dn;
- mutex_enter(&osi->os_lock);
+ mutex_enter(&os->os_lock);
/* process the mdn last, since the other dnodes have holds on it */
- list_remove(&osi->os_dnodes, osi->os_meta_dnode);
- list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+ list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+ list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
/*
* Find the first dnode with holds. We have to do this dance
@@ -429,93 +501,114 @@ dmu_objset_evict_dbufs(objset_t *os)
* hold. If there are no holds then it has no dbufs so OK to
* skip.
*/
- for (dn = list_head(&osi->os_dnodes);
+ for (dn = list_head(&os->os_dnodes);
dn && !dnode_add_ref(dn, FTAG);
- dn = list_next(&osi->os_dnodes, dn))
+ dn = list_next(&os->os_dnodes, dn))
continue;
while (dn) {
dnode_t *next_dn = dn;
do {
- next_dn = list_next(&osi->os_dnodes, next_dn);
+ next_dn = list_next(&os->os_dnodes, next_dn);
} while (next_dn && !dnode_add_ref(next_dn, FTAG));
- mutex_exit(&osi->os_lock);
+ mutex_exit(&os->os_lock);
dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
- mutex_enter(&osi->os_lock);
+ mutex_enter(&os->os_lock);
dn = next_dn;
}
- mutex_exit(&osi->os_lock);
- return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
+ dn = list_head(&os->os_dnodes);
+ mutex_exit(&os->os_lock);
+ return (dn != DMU_META_DNODE(os));
}
void
-dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+dmu_objset_evict(objset_t *os)
{
- objset_impl_t *osi = arg;
- objset_t os;
- int i;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- }
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
if (!dsl_dataset_is_snapshot(ds)) {
VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
+ checksum_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
+ compression_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
+ copies_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+ dedup_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "logbias",
+ logbias_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "sync",
+ sync_changed_cb, os));
}
VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
- primary_cache_changed_cb, osi));
+ primary_cache_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
- secondary_cache_changed_cb, osi));
+ secondary_cache_changed_cb, os));
}
+ if (os->os_sa)
+ sa_tear_down(os);
+
/*
* We should need only a single pass over the dnode list, since
* nothing can be added to the list at this point.
*/
- os.os = osi;
- (void) dmu_objset_evict_dbufs(&os);
+ (void) dmu_objset_evict_dbufs(os);
- dnode_special_close(osi->os_meta_dnode);
- if (osi->os_userused_dnode) {
- dnode_special_close(osi->os_userused_dnode);
- dnode_special_close(osi->os_groupused_dnode);
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
}
- zil_free(osi->os_zil);
+ zil_free(os->os_zil);
+
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
- ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ kmem_free(os, sizeof (objset_t));
+}
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
- mutex_destroy(&osi->os_lock);
- mutex_destroy(&osi->os_obj_lock);
- mutex_destroy(&osi->os_user_ptr_lock);
- kmem_free(osi, sizeof (objset_impl_t));
+timestruc_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
/* called from dsl for meta-objset */
-objset_impl_t *
+objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dmu_objset_type_t type, dmu_tx_t *tx)
{
- objset_impl_t *osi;
+ objset_t *os;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- if (ds)
- mutex_enter(&ds->ds_opening_lock);
- VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
- if (ds)
- mutex_exit(&ds->ds_opening_lock);
- mdn = osi->os_meta_dnode;
+ if (ds != NULL)
+ VERIFY(0 == dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -550,24 +643,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(type != DMU_OST_NONE);
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
- osi->os_phys->os_type = type;
- if (dmu_objset_userused_enabled(osi)) {
- osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
- osi->os_flags = osi->os_phys->os_flags;
+ os->os_phys->os_type = type;
+ if (dmu_objset_userused_enabled(os)) {
+ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
}
dsl_dataset_dirty(ds, tx);
- return (osi);
+ return (os);
}
struct oscarg {
void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
void *userarg;
- dsl_dataset_t *clone_parent;
+ dsl_dataset_t *clone_origin;
const char *lastname;
dmu_objset_type_t type;
uint64_t flags;
+ cred_t *cr;
};
/*ARGSUSED*/
@@ -585,17 +679,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (err != ENOENT)
return (err ? err : EEXIST);
- if (oa->clone_parent != NULL) {
- /*
- * You can't clone across pools.
- */
- if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+ if (oa->clone_origin != NULL) {
+ /* You can't clone across pools. */
+ if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
return (EXDEV);
- /*
- * You can only clone snapshots, not the head datasets.
- */
- if (oa->clone_parent->ds_phys->ds_num_children == 0)
+ /* You can only clone snapshots, not the head datasets. */
+ if (!dsl_dataset_is_snapshot(oa->clone_origin))
return (EINVAL);
}
@@ -603,41 +693,40 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
+ spa_t *spa = dd->dd_pool->dp_spa;
struct oscarg *oa = arg2;
- dsl_dataset_t *ds;
- blkptr_t *bp;
- uint64_t dsobj;
+ uint64_t obj;
ASSERT(dmu_tx_is_syncing(tx));
- dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_parent, oa->flags, cr, tx);
+ obj = dsl_dataset_create_sync(dd, oa->lastname,
+ oa->clone_origin, oa->flags, oa->cr, tx);
+
+ if (oa->clone_origin == NULL) {
+ dsl_pool_t *dp = dd->dd_pool;
+ dsl_dataset_t *ds;
+ blkptr_t *bp;
+ objset_t *os;
- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
- bp = dsl_dataset_get_blkptr(ds);
- if (BP_IS_HOLE(bp)) {
- objset_impl_t *osi;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+ bp = dsl_dataset_get_blkptr(ds);
+ ASSERT(BP_IS_HOLE(bp));
- /* This is an empty dmu_objset; not a clone. */
- osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, bp, oa->type, tx);
+ os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
if (oa->userfunc)
- oa->userfunc(&osi->os, oa->userarg, cr, tx);
+ oa->userfunc(os, oa->userarg, oa->cr, tx);
+ dsl_dataset_rele(ds, FTAG);
}
- spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dsobj);
-
- dsl_dataset_rele(ds, FTAG);
+ spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
}
int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
dsl_dir_t *pdd;
@@ -654,24 +743,13 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
return (EEXIST);
}
- dprintf("name=%s\n", name);
-
oa.userfunc = func;
oa.userarg = arg;
oa.lastname = tail;
oa.type = type;
oa.flags = flags;
+ oa.cr = CRED();
- if (clone_parent != NULL) {
- /*
- * You can't clone to a different type.
- */
- if (clone_parent->os->os_phys->os_type != type) {
- dsl_dir_close(pdd, FTAG);
- return (EINVAL);
- }
- oa.clone_parent = clone_parent->os->os_dsl_dataset;
- }
err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
dmu_objset_create_sync, pdd, &oa, 5);
dsl_dir_close(pdd, FTAG);
@@ -679,67 +757,59 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
}
int
-dmu_objset_destroy(const char *name)
+dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
{
- objset_t *os;
- int error;
-
- /*
- * If it looks like we'll be able to destroy it, and there's
- * an unplayed replay log sitting around, destroy the log.
- * It would be nicer to do this in dsl_dataset_destroy_sync(),
- * but the replay log objset is modified in open context.
- */
- error = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
- if (error == 0) {
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- zil_destroy(dmu_objset_zil(os), B_FALSE);
+ dsl_dir_t *pdd;
+ const char *tail;
+ int err = 0;
+ struct oscarg oa = { 0 };
- error = dsl_dataset_destroy(ds, os);
- /*
- * dsl_dataset_destroy() closes the ds.
- */
- kmem_free(os, sizeof (objset_t));
+ ASSERT(strchr(name, '@') == NULL);
+ err = dsl_dir_open(name, FTAG, &pdd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ dsl_dir_close(pdd, FTAG);
+ return (EEXIST);
}
- return (error);
+ oa.lastname = tail;
+ oa.clone_origin = clone_origin;
+ oa.flags = flags;
+ oa.cr = CRED();
+
+ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+ dmu_objset_create_sync, pdd, &oa, 5);
+ dsl_dir_close(pdd, FTAG);
+ return (err);
}
-/*
- * This will close the objset.
- */
int
-dmu_objset_rollback(objset_t *os)
+dmu_objset_destroy(const char *name, boolean_t defer)
{
- int err;
dsl_dataset_t *ds;
+ int error;
- ds = os->os->os_dsl_dataset;
-
- if (!dsl_dataset_tryown(ds, TRUE, os)) {
- dmu_objset_close(os);
- return (EBUSY);
+ error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_dataset_destroy(ds, FTAG, defer);
+ /* dsl_dataset_destroy() closes the ds. */
}
- err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
-
- /*
- * NB: we close the objset manually because the rollback
- * actually implicitly called dmu_objset_evict(), thus freeing
- * the objset_impl_t.
- */
- dsl_dataset_disown(ds, os);
- kmem_free(os, sizeof (objset_t));
- return (err);
+ return (error);
}
struct snaparg {
dsl_sync_task_group_t *dstg;
char *snapname;
+ char *htag;
char failed[MAXPATHLEN];
- boolean_t checkperms;
+ boolean_t recursive;
+ boolean_t needsuspend;
+ boolean_t temporary;
nvlist_t *props;
+ struct dsl_ds_holdarg *ha; /* only needed in the temporary case */
+ dsl_dataset_t *newds;
};
static int
@@ -747,77 +817,137 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
struct snaparg *sn = arg2;
+ int error;
/* The props have already been checked by zfs_check_userprops(). */
- return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
- sn->snapname, tx));
+ error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
+ sn->snapname, tx);
+ if (error)
+ return (error);
+
+ if (sn->temporary) {
+ /*
+ * Ideally we would just call
+ * dsl_dataset_user_hold_check() and
+ * dsl_dataset_destroy_check() here. However, the
+ * dataset we want to hold and destroy is the snapshot
+ * that we just confirmed we can create, but it won't
+ * exist until after these checks are run. Do any
+ * checks we can here and if more checks are added to
+ * those routines in the future, similar checks may be
+ * necessary here.
+ */
+ if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ /*
+ * Not checking number of tags because the tag will be
+ * unique, as it will be the only tag.
+ */
+ if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (E2BIG);
+
+ sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+ sn->ha->temphold = B_TRUE;
+ sn->ha->htag = sn->htag;
+ }
+ return (error);
}
static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
struct snaparg *sn = arg2;
- dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+ dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
+
+ if (sn->props) {
+ dsl_props_arg_t pa;
+ pa.pa_props = sn->props;
+ pa.pa_source = ZPROP_SRC_LOCAL;
+ dsl_props_set_sync(ds->ds_prev, &pa, tx);
+ }
+
+ if (sn->temporary) {
+ struct dsl_ds_destroyarg da;
+
+ dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
+ kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
+ sn->ha = NULL;
+ sn->newds = ds->ds_prev;
- if (sn->props)
- dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+ da.ds = ds->ds_prev;
+ da.defer = B_TRUE;
+ dsl_dataset_destroy_sync(&da, FTAG, tx);
+ }
}
static int
-dmu_objset_snapshot_one(char *name, void *arg)
+dmu_objset_snapshot_one(const char *name, void *arg)
{
struct snaparg *sn = arg;
objset_t *os;
int err;
+ char *cp;
+
+ /*
+ * If the objset starts with a '%', then ignore it unless it was
+ * explicitly named (ie, not recursive). These hidden datasets
+ * are always inconsistent, and by not opening them here, we can
+ * avoid a race with dsl_dir_destroy_check().
+ */
+ cp = strrchr(name, '/');
+ if (cp && cp[1] == '%' && sn->recursive)
+ return (0);
(void) strcpy(sn->failed, name);
/*
- * Check permissions only when requested. This only applies when
- * doing a recursive snapshot. The permission checks for the starting
- * dataset have already been performed in zfs_secpolicy_snapshot()
+ * Check permissions if we are doing a recursive snapshot. The
+ * permission checks for the starting dataset have already been
+ * performed in zfs_secpolicy_snapshot().
*/
- if (sn->checkperms == B_TRUE &&
- (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+ if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
return (err);
- err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
+ err = dmu_objset_hold(name, sn, &os);
if (err != 0)
return (err);
- /* If the objset is in an inconsistent state, return busy */
- if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
- dmu_objset_close(os);
- return (EBUSY);
- }
-
/*
- * NB: we need to wait for all in-flight changes to get to disk,
- * so that we snapshot those changes. zil_suspend does this as
- * a side effect.
+ * If the objset is in an inconsistent state (eg, in the process
+ * of being destroyed), don't snapshot it. As with %hidden
+ * datasets, we return EBUSY if this name was explicitly
+ * requested (ie, not recursive), and otherwise ignore it.
*/
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0) {
- dsl_sync_task_create(sn->dstg, snapshot_check,
- snapshot_sync, os, sn, 3);
- } else {
- dmu_objset_close(os);
+ if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
+ dmu_objset_rele(os, sn);
+ return (sn->recursive ? 0 : EBUSY);
}
- return (err);
+ if (sn->needsuspend) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err) {
+ dmu_objset_rele(os, sn);
+ return (err);
+ }
+ }
+ dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
+ os, sn, 3);
+
+ return (0);
}
int
-dmu_objset_snapshot(char *fsname, char *snapname,
- nvlist_t *props, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
{
dsl_sync_task_t *dst;
struct snaparg sn;
spa_t *spa;
+ minor_t minor;
int err;
(void) strcpy(sn.failed, fsname);
@@ -826,16 +956,31 @@ dmu_objset_snapshot(char *fsname, char *snapname,
if (err)
return (err);
+ if (temporary) {
+ if (cleanup_fd < 0) {
+ spa_close(spa, FTAG);
+ return (EINVAL);
+ }
+ if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
+ spa_close(spa, FTAG);
+ return (err);
+ }
+ }
+
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
+ sn.htag = tag;
sn.props = props;
+ sn.recursive = recursive;
+ sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ sn.temporary = temporary;
+ sn.ha = NULL;
+ sn.newds = NULL;
if (recursive) {
- sn.checkperms = B_TRUE;
err = dmu_objset_find(fsname,
dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
} else {
- sn.checkperms = B_FALSE;
err = dmu_objset_snapshot_one(fsname, &sn);
}
@@ -845,15 +990,33 @@ dmu_objset_snapshot(char *fsname, char *snapname,
for (dst = list_head(&sn.dstg->dstg_tasks); dst;
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
objset_t *os = dst->dst_arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- if (dst->dst_err)
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ if (dst->dst_err) {
dsl_dataset_name(ds, sn.failed);
- zil_resume(dmu_objset_zil(os));
- dmu_objset_close(os);
+ } else if (temporary) {
+ dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
+ }
+ if (sn.needsuspend)
+ zil_resume(dmu_objset_zil(os));
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) {
+ char name[MAXNAMELEN];
+
+ dmu_objset_name(os, name);
+ strlcat(name, "@", sizeof(name));
+ strlcat(name, snapname, sizeof(name));
+ zvol_create_minors(name);
+ }
+#endif
+#endif
+ dmu_objset_rele(os, &sn);
}
if (err)
(void) strcpy(fsname, sn.failed);
+ if (temporary)
+ zfs_onexit_fd_rele(cleanup_fd);
dsl_sync_task_group_destroy(sn.dstg);
spa_close(spa, FTAG);
return (err);
@@ -888,11 +1051,10 @@ dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
/* ARGSUSED */
static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- objset_impl_t *os = arg;
+ objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(bp == os->os_rootbp);
@@ -908,24 +1070,34 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, zio, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
/* called from dsl */
void
-dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_t zb;
- writeprops_t wp = { 0 };
+ zio_prop_t zp;
zio_t *zio;
list_t *list;
list_t *newlist = NULL;
@@ -949,42 +1121,33 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
/*
* Create the root block IO
*/
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1; /* for block ordering; it's level 0 on disk */
- zb.zb_blkid = 0;
-
- wp.wp_type = DMU_OT_OBJSET;
- wp.wp_level = 0; /* on-disk BP level; see above */
- wp.wp_copies = os->os_copies;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_oscompress = os->os_compress;
-
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- os->os_rootbp, pio, tx);
- }
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
+ os->os_rootbp, os->os_spa, &zb));
- arc_release(os->os_phys_buf, &os->os_phys_buf);
+ dmu_write_policy(os, NULL, 0, 0, &zp);
- zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
+ dmu_objset_write_ready, dmu_objset_write_done, os,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync special dnodes - the parent IO for the sync is the root block
*/
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
os->os_phys->os_flags = os->os_flags;
- if (os->os_userused_dnode &&
- os->os_userused_dnode->dn_type != DMU_OT_NONE) {
- os->os_userused_dnode->dn_zio = zio;
- dnode_sync(os->os_userused_dnode, tx);
- os->os_groupused_dnode->dn_zio = zio;
- dnode_sync(os->os_groupused_dnode, tx);
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
}
txgoff = tx->tx_txg & TXG_MASK;
@@ -1002,7 +1165,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while (dr = list_head(list)) {
ASSERT(dr->dr_dbuf->db_level == 0);
list_remove(list, dr);
@@ -1017,6 +1180,22 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
zio_nowait(zio);
}
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
+ !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
+}
+
+boolean_t
+dmu_objset_is_dirty_anywhere(objset_t *os)
+{
+ for (int t = 0; t < TXG_SIZE; t++)
+ if (dmu_objset_is_dirty(os, t))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
@@ -1026,74 +1205,86 @@ dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
}
boolean_t
-dmu_objset_userused_enabled(objset_impl_t *os)
+dmu_objset_userused_enabled(objset_t *os)
{
return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
- used_cbs[os->os_phys->os_type] &&
- os->os_userused_dnode);
+ used_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
+}
+
+static void
+do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
+ uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+{
+ if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+ int64_t delta = DNODE_SIZE + used;
+ if (subtract)
+ delta = -delta;
+ VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
+ user, delta, tx));
+ VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+ group, delta, tx));
+ }
}
void
-dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
dnode_t *dn;
list_t *list = &os->os_synced_dnodes;
- static const char zerobuf[DN_MAX_BONUSLEN] = {0};
ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
while (dn = list_head(list)) {
- dmu_object_type_t bonustype;
-
+ int flags;
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
- ASSERT(dn->dn_oldphys);
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED);
/* Allocate the user/groupused objects if necessary. */
- if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
- VERIFY(0 == zap_create_claim(&os->os,
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY(0 == zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
- VERIFY(0 == zap_create_claim(&os->os,
+ VERIFY(0 == zap_create_claim(os,
DMU_GROUPUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
/*
- * If the object was not previously
- * accounted, pretend that it was free.
+ * We intentionally modify the zap object even if the
+ * net delta is zero. Otherwise
+ * the block of the zap obj could be shared between
+ * datasets but would need to be different between them after
+ * a bprewrite.
*/
- if (!(dn->dn_oldphys->dn_flags &
- DNODE_FLAG_USERUSED_ACCOUNTED)) {
- bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
- }
- /*
- * If the object was freed, use the previous bonustype.
- */
- bonustype = dn->dn_phys->dn_bonustype ?
- dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
- ASSERT(dn->dn_phys->dn_type != 0 ||
- (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
- DN_MAX_BONUSLEN) == 0 &&
- DN_USED_BYTES(dn->dn_phys) == 0));
- ASSERT(dn->dn_oldphys->dn_type != 0 ||
- (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
- DN_MAX_BONUSLEN) == 0 &&
- DN_USED_BYTES(dn->dn_oldphys) == 0));
- used_cbs[os->os_phys->os_type](&os->os, bonustype,
- DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
- DN_USED_BYTES(dn->dn_oldphys),
- DN_USED_BYTES(dn->dn_phys), tx);
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+ }
+ if (flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+ dn->dn_phys->dn_flags, dn->dn_newuid,
+ dn->dn_newgid, B_FALSE, tx);
+ }
- /*
- * The mutex is needed here for interlock with dnode_allocate.
- */
mutex_enter(&dn->dn_mtx);
- zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
- dn->dn_oldphys = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
mutex_exit(&dn->dn_mtx);
list_remove(list, dn);
@@ -1101,10 +1292,151 @@ dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
}
}
+/*
+ * Returns a pointer to the data from which to find the uid/gid.
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found then NULL is returned. In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr, **drp;
+ void *data;
+
+ if (db->db_dirtycnt == 0)
+ return (db->db.db_data); /* Nothing is changing */
+
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg == tx->tx_txg)
+ break;
+
+ if (dr == NULL) {
+ data = NULL;
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(dr->dr_dbuf);
+ dn = DB_DNODE(dr->dr_dbuf);
+
+ if (dn->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+
+ DB_DNODE_EXIT(dr->dr_dbuf);
+ }
+
+ return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_impl_t *db = NULL;
+ uint64_t *user, *group;
+ int flags = dn->dn_id_flags;
+ int error;
+ boolean_t have_spill = B_FALSE;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0) {
+ if (dn->dn_bonus) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ data = dmu_objset_userquota_find_data(db, tx);
+ } else {
+ data = DN_BONUS(dn->dn_phys);
+ }
+ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
+ FTAG, (dmu_buf_t **)&db);
+ ASSERT(error == 0);
+ mutex_enter(&db->db_mtx);
+ data = (before) ? db->db.db_data :
+ dmu_objset_userquota_find_data(db, tx);
+ have_spill = B_TRUE;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+ if (before) {
+ ASSERT(data);
+ user = &dn->dn_olduid;
+ group = &dn->dn_oldgid;
+ } else if (data) {
+ user = &dn->dn_newuid;
+ group = &dn->dn_newgid;
+ }
+
+ /*
+ * Must always call the callback in case the object
+ * type has changed and that type isn't an object type to track.
+ */
+ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+ user, group);
+
+ /*
+ * Preserve existing uid/gid when the callback can't determine
+ * what the new uid/gid are and the callback returned EEXIST.
+ * The EEXIST error tells us to just use the existing uid/gid.
+ * If we don't know what the old values are then just assign
+ * them to 0, since that is a new file being created.
+ */
+ if (!before && data == NULL && error == EEXIST) {
+ if (flags & DN_ID_OLD_EXIST) {
+ dn->dn_newuid = dn->dn_olduid;
+ dn->dn_newgid = dn->dn_oldgid;
+ } else {
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ }
+ error = 0;
+ }
+
+ if (db)
+ mutex_exit(&db->db_mtx);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (have_spill) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (have_spill)
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
- return (os->os->os_phys->os_flags &
+ return (os->os_phys->os_flags &
OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}
@@ -1116,7 +1448,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
if (dmu_objset_userspace_present(os))
return (0);
- if (!dmu_objset_userused_enabled(os->os))
+ if (!dmu_objset_userused_enabled(os))
return (ENOTSUP);
if (dmu_objset_is_snapshot(os))
return (EINVAL);
@@ -1152,7 +1484,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
dmu_tx_commit(tx);
}
- os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);
}
@@ -1161,35 +1493,35 @@ void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
- dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
+ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
usedobjsp, availobjsp);
}
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
- return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
+ return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
- stat->dds_type = os->os->os_phys->os_type;
- if (os->os->os_dsl_dataset)
- dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
+ stat->dds_type = os->os_phys->os_type;
+ if (os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
- ASSERT(os->os->os_dsl_dataset ||
- os->os->os_phys->os_type == DMU_OST_META);
+ ASSERT(os->os_dsl_dataset ||
+ os->os_phys->os_type == DMU_OST_META);
- if (os->os->os_dsl_dataset != NULL)
- dsl_dataset_stats(os->os->os_dsl_dataset, nv);
+ if (os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os_dsl_dataset, nv);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
- os->os->os_phys->os_type);
+ os->os_phys->os_type);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
dmu_objset_userspace_present(os));
}
@@ -1197,8 +1529,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
int
dmu_objset_is_snapshot(objset_t *os)
{
- if (os->os->os_dsl_dataset != NULL)
- return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ if (os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
else
return (B_FALSE);
}
@@ -1207,7 +1539,7 @@ int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
boolean_t *conflict)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
uint64_t ignored;
if (ds->ds_phys->ds_snapnames_zapobj == 0)
@@ -1222,7 +1554,7 @@ int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
zap_cursor_t cursor;
zap_attribute_t attr;
@@ -1259,12 +1591,12 @@ int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp)
{
- dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
zap_cursor_t cursor;
zap_attribute_t attr;
/* there is no next dir on a snapshot! */
- if (os->os->os_dsl_dataset->ds_object !=
+ if (os->os_dsl_dataset->ds_object !=
dd->dd_phys->dd_head_dataset_obj)
return (ENOENT);
@@ -1293,7 +1625,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
}
struct findarg {
- int (*func)(char *, void *);
+ int (*func)(const char *, void *);
void *arg;
};
@@ -1302,7 +1634,7 @@ static int
findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
struct findarg *fa = arg;
- return (fa->func((char *)dsname, fa->arg));
+ return (fa->func(dsname, fa->arg));
}
/*
@@ -1310,7 +1642,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
* Perhaps change all callers to use dmu_objset_find_spa()?
*/
int
-dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
+ int flags)
{
struct findarg fa;
fa.func = func;
@@ -1361,12 +1694,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
ASSERT(attr->za_integer_length == sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strcpy(child, name);
- (void) strcat(child, "/");
- (void) strcat(child, attr->za_name);
+ child = kmem_asprintf("%s/%s", name, attr->za_name);
err = dmu_objset_find_spa(spa, child, func, arg, flags);
- kmem_free(child, MAXPATHLEN);
+ strfree(child);
if (err)
break;
}
@@ -1400,13 +1730,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strcpy(child, name);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
+ child = kmem_asprintf("%s@%s",
+ name, attr->za_name);
err = func(spa, attr->za_first_integer,
child, arg);
- kmem_free(child, MAXPATHLEN);
+ strfree(child);
if (err)
break;
}
@@ -1429,7 +1757,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
/* ARGSUSED */
int
-dmu_objset_prefetch(char *name, void *arg)
+dmu_objset_prefetch(const char *name, void *arg)
{
dsl_dataset_t *ds;
@@ -1438,16 +1766,14 @@ dmu_objset_prefetch(char *name, void *arg)
if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
mutex_enter(&ds->ds_opening_lock);
- if (!dsl_dataset_get_user_ptr(ds)) {
+ if (ds->ds_objset == NULL) {
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = ds->ds_object;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
+ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
+ (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
&ds->ds_phys->ds_bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
@@ -1463,13 +1789,13 @@ dmu_objset_prefetch(char *name, void *arg)
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
- ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
- os->os->os_user_ptr = user_ptr;
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ os->os_user_ptr = user_ptr;
}
void *
dmu_objset_get_user(objset_t *os)
{
- ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
- return (os->os->os_user_ptr);
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ return (os->os_user_ptr);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index ed5afb4e1df5..55451fd931ad 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -33,14 +32,32 @@
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
static char *dmu_recv_tag = "dmu_recv_tag";
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another. Multiple calls to dump_free() and
+ * dump_freeobjects() can be aggregated into a single DRR_FREE or
+ * DRR_FREEOBJECTS replay record.
+ */
+typedef enum {
+ PENDING_NONE,
+ PENDING_FREE,
+ PENDING_FREEOBJECTS
+} pendop_t;
+
struct backuparg {
dmu_replay_record_t *drr;
kthread_t *td;
@@ -48,7 +65,9 @@ struct backuparg {
offset_t *off;
objset_t *os;
zio_cksum_t zc;
+ uint64_t toguid;
int err;
+ pendop_t pending_op;
};
static int
@@ -56,11 +75,9 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
{
struct uio auio;
struct iovec aiov;
-
ASSERT3U(len % 8, ==, 0);
fletcher_4_incremental_native(buf, len, &ba->zc);
-
aiov.iov_base = buf;
aiov.iov_len = len;
auio.uio_iov = &aiov;
@@ -79,7 +96,6 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
ba->err = EOPNOTSUPP;
#endif
*ba->off += len;
-
return (ba->err);
}
@@ -87,29 +103,120 @@ static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
uint64_t length)
{
- /* write a FREE record */
+ struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREE, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_FREE records can only be aggregated with
+ * other DRR_FREE records; DRR_FREEOBJECTS records can only be
+ * aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
+ if (ba->pending_op == PENDING_FREE) {
+ /*
+ * There should never be a PENDING_FREE if length is -1
+ * (because dump_dnode is the only place where this
+ * function is called with a -1, and only after flushing
+ * any pending record).
+ */
+ ASSERT(length != -1ULL);
+ /*
+ * Check to see whether this free block can be aggregated
+ * with the pending one.
+ */
+ if (drrf->drr_object == object && drrf->drr_offset +
+ drrf->drr_length == offset) {
+ drrf->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_bytes(ba, ba->drr,
+ sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ }
+ /* create a FREE record and make it pending */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_FREE;
- ba->drr->drr_u.drr_free.drr_object = object;
- ba->drr->drr_u.drr_free.drr_offset = offset;
- ba->drr->drr_u.drr_free.drr_length = length;
+ drrf->drr_object = object;
+ drrf->drr_offset = offset;
+ drrf->drr_length = length;
+ drrf->drr_toguid = ba->toguid;
+ if (length == -1ULL) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ } else {
+ ba->pending_op = PENDING_FREE;
+ }
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
return (0);
}
static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, void *data)
+ uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
+ struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
+
+
+ /*
+ * If there is any kind of pending aggregation (currently either
+ * a grouping of free objects or free blocks), push it out to
+ * the stream, since aggregation can't be done across operations
+ * of different types.
+ */
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
/* write a DATA record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_WRITE;
- ba->drr->drr_u.drr_write.drr_object = object;
- ba->drr->drr_u.drr_write.drr_type = type;
- ba->drr->drr_u.drr_write.drr_offset = offset;
- ba->drr->drr_u.drr_write.drr_length = blksz;
+ drrw->drr_object = object;
+ drrw->drr_type = type;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = ba->toguid;
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ if (dump_bytes(ba, data, blksz) != 0)
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
+{
+ struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
+
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = ba->toguid;
if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
return (EINTR);
@@ -121,39 +228,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type,
static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
+ struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+ * push it out, since free block aggregation can only be done for
+ * blocks of the same type (i.e., DRR_FREE records can only be
+ * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
+ * can only be aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (ba->pending_op != PENDING_NONE &&
+ ba->pending_op != PENDING_FREEOBJECTS) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ if (ba->pending_op == PENDING_FREEOBJECTS) {
+ /*
+ * See whether this free object array can be aggregated
+ * with the pending one.
+ */
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+ drrfo->drr_numobjs += numobjs;
+ return (0);
+ } else {
+ /* can't be aggregated. Push out pending record */
+ if (dump_bytes(ba, ba->drr,
+ sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+ }
+
/* write a FREEOBJECTS record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_FREEOBJECTS;
- ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
- ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+ drrfo->drr_firstobj = firstobj;
+ drrfo->drr_numobjs = numobjs;
+ drrfo->drr_toguid = ba->toguid;
+
+ ba->pending_op = PENDING_FREEOBJECTS;
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
return (0);
}
static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
+ struct drr_object *drro = &(ba->drr->drr_u.drr_object);
+
if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
return (dump_freeobjects(ba, object, 1));
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
/* write an OBJECT record */
bzero(ba->drr, sizeof (dmu_replay_record_t));
ba->drr->drr_type = DRR_OBJECT;
- ba->drr->drr_u.drr_object.drr_object = object;
- ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
- ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
- ba->drr->drr_u.drr_object.drr_blksz =
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
- ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
- ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ drro->drr_object = object;
+ drro->drr_type = dnp->dn_type;
+ drro->drr_bonustype = dnp->dn_bonustype;
+ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_checksumtype = dnp->dn_checksum;
+ drro->drr_compress = dnp->dn_compress;
+ drro->drr_toguid = ba->toguid;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
return (EINTR);
- if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+ if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
return (EINTR);
/* free anything past the end of the file */
@@ -169,9 +317,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/* ARGSUSED */
static int
-backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct backuparg *ba = arg;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
@@ -180,9 +329,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
- if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (bp == NULL && zb->zb_object == 0) {
+ } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
@@ -198,7 +348,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
- if (arc_read_nolock(NULL, spa, bp,
+ if (dsl_read(NULL, spa, bp, pbuf,
arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
@@ -212,7 +362,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
- } else { /* it's a level-0 block of a regular object */
+ } else if (type == DMU_OT_SA) {
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -222,8 +372,20 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
+ err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ } else { /* it's a level-0 block of a regular object */
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ int blksz = BP_GET_LSIZE(bp);
+
+ if (dsl_read(NULL, spa, bp, pbuf,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, abuf->b_data);
+ blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -235,8 +397,8 @@ int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
struct file *fp, offset_t *off)
{
- dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
- dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+ dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
dmu_replay_record_t *drr;
struct backuparg ba;
int err;
@@ -273,10 +435,25 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
+ DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
+ DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+ if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
+ uint64_t version;
+ if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0)
+ return (EINVAL);
+ if (version == ZPL_VERSION_SA) {
+ DMU_SET_FEATUREFLAGS(
+ drr->drr_u.drr_begin.drr_versioninfo,
+ DMU_BACKUP_FEATURE_SA_SPILL);
+ }
+ }
+#endif
+
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
- drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
if (fromorigin)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
@@ -297,9 +474,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
ba.fp = fp;
ba.os = tosnap;
ba.off = off;
+ ba.toguid = ds->ds_phys->ds_guid;
ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
+ ba.pending_op = PENDING_NONE;
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
return (ba.err);
}
@@ -307,6 +486,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
backup_cb, &ba);
+ if (ba.pending_op != PENDING_NONE)
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
+ err = EINTR;
+
if (err) {
if (err == EINTR && ba.err)
err = ba.err;
@@ -317,8 +500,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
bzero(drr, sizeof (dmu_replay_record_t));
drr->drr_type = DRR_END;
drr->drr_u.drr_end.drr_checksum = ba.zc;
+ drr->drr_u.drr_end.drr_toguid = ba.toguid;
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
return (ba.err);
}
@@ -339,33 +523,12 @@ struct recvbeginsyncarg {
uint64_t dsflags;
char clonelastname[MAXNAMELEN];
dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
+ cred_t *cr;
};
-static dsl_dataset_t *
-recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
- cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
-
- /* This should always work, since we just created it */
- /* XXX - create should return an owned ds */
- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
- DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
-
- if (type != DMU_OST_NONE) {
- (void) dmu_objset_create_impl(dp->dp_spa,
- ds, &ds->ds_phys->ds_bp, type, tx);
- }
-
- spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
-
- return (ds);
-}
-
/* ARGSUSED */
static int
-recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -383,7 +546,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* make sure it's a snap in the same pool */
if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
+ if (!dsl_dataset_is_snapshot(rbsa->origin))
return (EINVAL);
if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
return (ENODEV);
@@ -393,77 +556,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct recvbeginsyncarg *rbsa = arg2;
uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
+ /* Create and open new dataset. */
dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
- rbsa->origin, flags, cr, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
+ rbsa->origin, flags, rbsa->cr, tx);
+ VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
+ B_TRUE, dmu_recv_tag, &rbsa->ds));
-static int
-recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- int err;
-
- /* must be a head ds */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /* must not be a clone ds */
- if (dsl_dir_is_clone(ds->ds_dir))
- return (EINVAL);
-
- err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
- if (err)
- return (err);
-
- if (rbsa->origin) {
- /* make sure it's a snap in the same pool */
- if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
- return (EXDEV);
- if (rbsa->origin->ds_phys->ds_num_children == 0)
- return (EINVAL);
- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
+ if (rbsa->origin == NULL) {
+ (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
+ rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
}
- return (0);
-}
-
-static void
-recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct recvbeginsyncarg *rbsa = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
- uint64_t dsobj;
-
- /*
- * NB: caller must provide an extra hold on the dsl_dir_t, so it
- * won't go away when dsl_dataset_destroy_sync() closes the
- * dataset.
- */
- dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
-
- dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
-
- rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
- rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+ spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
+ dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}
/* ARGSUSED */
static int
-recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -474,77 +591,105 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
return (ETXTBSY);
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
-
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
- return (ENODEV);
-
- /* temporary clone name must not exist */
+ /* new snapshot name must not exist */
err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_dir->dd_phys->dd_child_dir_zapobj,
- rbsa->clonelastname, 8, 1, &val);
+ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
if (err == 0)
return (EEXIST);
if (err != ENOENT)
return (err);
- /* new snapshot name must not exist */
+ if (rbsa->fromguid) {
+ /* if incremental, most recent snapshot must match fromguid */
+ if (ds->ds_prev == NULL)
+ return (ENODEV);
+
+ /*
+ * most recent snapshot must match fromguid, or there are no
+ * changes since the fromguid one
+ */
+ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
+ uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
+ uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
+ while (obj != 0) {
+ dsl_dataset_t *snap;
+ err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ obj, FTAG, &snap);
+ if (err)
+ return (ENODEV);
+ if (snap->ds_phys->ds_creation_txg < birth) {
+ dsl_dataset_rele(snap, FTAG);
+ return (ENODEV);
+ }
+ if (snap->ds_phys->ds_guid == rbsa->fromguid) {
+ dsl_dataset_rele(snap, FTAG);
+ break; /* it's ok */
+ }
+ obj = snap->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
+ return (ENODEV);
+ }
+ } else {
+ /* if full, most recent snapshot must be $ORIGIN */
+ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
+ return (ENODEV);
+ }
+
+ /* temporary clone name must not exist */
err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+ ds->ds_dir->dd_phys->dd_child_dir_zapobj,
+ rbsa->clonelastname, 8, 1, &val);
if (err == 0)
return (EEXIST);
if (err != ENOENT)
return (err);
+
return (0);
}
/* ARGSUSED */
static void
-recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ohds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
dsl_pool_t *dp = ohds->ds_dir->dd_pool;
- dsl_dataset_t *ods, *cds;
+ dsl_dataset_t *cds;
uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
- /* create the temporary clone */
- VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
- FTAG, &ods));
- dsobj = dsl_dataset_create_sync(ohds->ds_dir,
- rbsa->clonelastname, ods, flags, cr, tx);
- dsl_dataset_rele(ods, FTAG);
-
- /* open the temporary clone */
- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
- DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
+ /* create and open the temporary clone */
+ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
+ ohds->ds_prev, flags, rbsa->cr, tx);
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
- /* copy the refquota from the target fs to the clone */
- if (ohds->ds_quota > 0)
- dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
+ /*
+ * If we actually created a non-clone, we need to create the
+ * objset in our new dataset.
+ */
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+ }
rbsa->ds = cds;
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+ spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
+ dp->dp_spa, tx, "dataset = %lld", dsobj);
}
-/* ARGSUSED */
-static void
-recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+static boolean_t
+dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
- dsl_dataset_t *ds = arg1;
+ int featureflags;
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
- ds->ds_object);
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}
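For context on the drr_versioninfo accessors used here and in dmu_sendbackup(): the single 64-bit field replaces the old drr_version and packs a stream header type (DMU_SUBSTREAM vs. DMU_COMPOUNDSTREAM) together with feature flags such as DMU_BACKUP_FEATURE_SA_SPILL and DMU_BACKUP_FEATURE_DEDUP. A minimal sketch of that kind of packing follows; the VI_* names, bit widths, and SKETCH_* constants are hypothetical, the real layout is whatever the DMU_GET/SET_STREAM_HDRTYPE and DMU_GET/SET_FEATUREFLAGS macros implement.

#include <stdint.h>

/* Sketch only: hypothetical layout, low bits = header type, rest = flags. */
#define	VI_HDRTYPE_BITS		2
#define	VI_GET_HDRTYPE(vi)	((vi) & ((1ULL << VI_HDRTYPE_BITS) - 1))
#define	VI_GET_FEATURES(vi)	((vi) >> VI_HDRTYPE_BITS)
#define	VI_SET_FEATURE(vi, f)	((vi) |= ((uint64_t)(f) << VI_HDRTYPE_BITS))

enum { SKETCH_SUBSTREAM = 1 };			/* stands in for DMU_SUBSTREAM */
enum { SKETCH_FEATURE_SA_SPILL = 1 << 0 };	/* stands in for the SA_SPILL flag */

/* Usage: uint64_t vi = SKETCH_SUBSTREAM; VI_SET_FEATURE(vi, SKETCH_FEATURE_SA_SPILL); */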
/*
@@ -552,13 +697,13 @@ recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
int err = 0;
boolean_t byteswap;
- struct recvbeginsyncarg rbsa;
- uint64_t version;
+ struct recvbeginsyncarg rbsa = { 0 };
+ uint64_t versioninfo;
int flags;
dsl_dataset_t *ds;
@@ -571,22 +716,23 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
rbsa.tofs = tofs;
rbsa.tosnap = tosnap;
- rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
+ rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
rbsa.fromguid = drrb->drr_fromguid;
rbsa.type = drrb->drr_type;
rbsa.tag = FTAG;
rbsa.dsflags = 0;
- version = drrb->drr_version;
+ rbsa.cr = CRED();
+ versioninfo = drrb->drr_versioninfo;
flags = drrb->drr_flags;
if (byteswap) {
rbsa.type = BSWAP_32(rbsa.type);
rbsa.fromguid = BSWAP_64(rbsa.fromguid);
- version = BSWAP_64(version);
+ versioninfo = BSWAP_64(versioninfo);
flags = BSWAP_32(flags);
}
- if (version != DMU_BACKUP_STREAM_VERSION ||
+ if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
rbsa.type >= DMU_OST_NUMTYPES ||
((flags & DRR_FLAG_CLONE) && origin == NULL))
return (EINVAL);
@@ -597,102 +743,81 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
bzero(drc, sizeof (dmu_recv_cookie_t));
drc->drc_drrb = drrb;
drc->drc_tosnap = tosnap;
+ drc->drc_top_ds = top_ds;
drc->drc_force = force;
/*
* Process the begin in syncing context.
*/
- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
- /* offline incremental receive */
- err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
- if (err)
- return (err);
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- rbsa.fromguid) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ /* open the dataset we are logically receiving into */
+ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+ if (err == 0) {
+ if (dmu_recv_verify_features(ds, drrb)) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (ENOTSUP);
}
- rbsa.force = B_FALSE;
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_offline_incremental_sync, ds, &rbsa, 1);
- if (err) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (err);
+ /* target fs already exists; recv into temp clone */
+
+ /* Can't recv a clone into an existing fs */
+ if (flags & DRR_FLAG_CLONE) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (EINVAL);
+ }
+
+ /* must not have an incremental recv already in progress */
+ if (!mutex_tryenter(&ds->ds_recvlock)) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (EBUSY);
}
- drc->drc_logical_ds = drc->drc_real_ds = ds;
- } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
- /* online incremental receive */
/* tmp clone name is: tofs/%tosnap */
(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
"%%%s", tosnap);
-
- /* open the dataset we are logically receiving into */
- err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
- if (err)
- return (err);
-
rbsa.force = force;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_online_incremental_sync, ds, &rbsa, 5);
+ recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
if (err) {
+ mutex_exit(&ds->ds_recvlock);
dsl_dataset_rele(ds, dmu_recv_tag);
return (err);
}
drc->drc_logical_ds = ds;
drc->drc_real_ds = rbsa.ds;
- } else {
- /* create new fs -- full backup or clone */
- dsl_dir_t *dd = NULL;
- const char *tail;
+ } else if (err == ENOENT) {
+ /* target fs does not exist; must be a full backup or clone */
+ char *cp;
- err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+ /*
+ * If it's a non-clone incremental, we are missing the
+ * target fs, so fail the recv.
+ */
+ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
+ return (ENOENT);
+
+ /* Open the parent of tofs */
+ cp = strrchr(tofs, '/');
+ *cp = '\0';
+ err = dsl_dataset_hold(tofs, FTAG, &ds);
+ *cp = '/';
if (err)
return (err);
- if (tail == NULL) {
- if (!force) {
- dsl_dir_close(dd, FTAG);
- return (EEXIST);
- }
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- err = dsl_dataset_own_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj,
- DS_MODE_INCONSISTENT, FTAG, &ds);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- if (err) {
- dsl_dir_close(dd, FTAG);
- return (err);
- }
- dsl_dataset_make_exclusive(ds, FTAG);
- err = dsl_sync_task_do(dd->dd_pool,
- recv_full_existing_check,
- recv_full_existing_sync, ds, &rbsa, 5);
- dsl_dataset_disown(ds, FTAG);
- } else {
- err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
- recv_full_sync, dd, &rbsa, 5);
+ if (dmu_recv_verify_features(ds, drrb)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOTSUP);
}
- dsl_dir_close(dd, FTAG);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
+ dsl_dataset_rele(ds, FTAG);
if (err)
return (err);
drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
drc->drc_newfs = B_TRUE;
}
- return (0);
+ return (err);
}
struct restorearg {
@@ -704,10 +829,100 @@ struct restorearg {
uint64_t voff;
int bufsize; /* amount of memory allocated for buf */
zio_cksum_t cksum;
+ avl_tree_t *guid_to_ds_map;
};
+typedef struct guid_map_entry {
+ uint64_t guid;
+ dsl_dataset_t *gme_ds;
+ avl_node_t avlnode;
+} guid_map_entry_t;
+
static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
+guid_compare(const void *arg1, const void *arg2)
+{
+ const guid_map_entry_t *gmep1 = arg1;
+ const guid_map_entry_t *gmep2 = arg2;
+
+ if (gmep1->guid < gmep2->guid)
+ return (-1);
+ else if (gmep1->guid > gmep2->guid)
+ return (1);
+ return (0);
+}
+
+/*
+ * This function is a callback used by dmu_objset_find() (which
+ * enumerates the object sets) to build an avl tree that maps guids
+ * to datasets. The resulting table is used when processing DRR_WRITE_BYREF
+ * send stream records. These records, which are used in dedup'ed
+ * streams, do not contain data themselves, but refer to a copy
+ * of the data block that has already been written because it was
+ * earlier in the stream. That previous copy is identified by the
+ * guid of the dataset with the referenced data.
+ */
+int
+find_ds_by_guid(const char *name, void *arg)
+{
+ avl_tree_t *guid_map = arg;
+ dsl_dataset_t *ds, *snapds;
+ guid_map_entry_t *gmep;
+ dsl_pool_t *dp;
+ int err;
+ uint64_t lastobj, firstobj;
+
+ if (dsl_dataset_hold(name, FTAG, &ds) != 0)
+ return (0);
+
+ dp = ds->ds_dir->dd_pool;
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
+ lastobj = ds->ds_phys->ds_prev_snap_obj;
+
+ while (lastobj != firstobj) {
+ err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
+ if (err) {
+ /*
+ * Skip this snapshot and move on. It's not
+ * clear why this would ever happen, but the
+ * remainder of the snapshot stream can be
+ * processed.
+ */
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
+ gmep->guid = snapds->ds_phys->ds_guid;
+ gmep->gme_ds = snapds;
+ avl_add(guid_map, gmep);
+ lastobj = snapds->ds_phys->ds_prev_snap_obj;
+ }
+
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (0);
+}
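In miniature, what find_ds_by_guid() does for one filesystem is walk its snapshot chain from the newest snapshot back to the clone origin and record each snapshot's guid in a lookup structure. The sketch below shows that chain walk with a plain linked list standing in for the dsl_dataset_t chain and the AVL tree used above; the snapshot_t/guid_entry_t types are illustrative, not the kernel's.

#include <stdint.h>
#include <stdlib.h>

typedef struct snapshot {
	uint64_t	guid;
	struct snapshot	*prev;		/* older snapshot, NULL at the origin */
} snapshot_t;

typedef struct guid_entry {
	uint64_t		guid;
	snapshot_t		*snap;
	struct guid_entry	*next;
} guid_entry_t;

static void
map_snapshot_guids(snapshot_t *newest, guid_entry_t **table)
{
	/* Walk from the newest snapshot back toward the origin. */
	for (snapshot_t *s = newest; s != NULL; s = s->prev) {
		guid_entry_t *e = malloc(sizeof (*e));
		if (e == NULL)
			return;			/* sketch: give up quietly */
		e->guid = s->guid;
		e->snap = s;
		e->next = *table;		/* prepend to the lookup table */
		*table = e;
	}
}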
+
+static void
+free_guid_map_onexit(void *arg)
+{
+ avl_tree_t *ca = arg;
+ void *cookie = NULL;
+ guid_map_entry_t *gmep;
+
+ while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+ dsl_dataset_rele(gmep->gme_ds, ca);
+ kmem_free(gmep, sizeof (guid_map_entry_t));
+ }
+ avl_destroy(ca);
+ kmem_free(ca, sizeof (avl_tree_t));
+}
+
+static int
+restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
struct uio auio;
struct iovec aiov;
@@ -742,7 +957,7 @@ restore_read(struct restorearg *ra, int len)
ASSERT3U(len % 8, ==, 0);
while (done < len) {
- int resid;
+ ssize_t resid;
ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
len - done, ra->voff, &resid);
@@ -774,7 +989,7 @@ backup_byteswap(dmu_replay_record_t *drr)
switch (drr->drr_type) {
case DRR_BEGIN:
DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_version);
+ DO64(drr_begin.drr_versioninfo);
DO64(drr_begin.drr_creation_time);
DO32(drr_begin.drr_type);
DO32(drr_begin.drr_flags);
@@ -788,27 +1003,56 @@ backup_byteswap(dmu_replay_record_t *drr)
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
DO32(drr_object.drr_bonuslen);
+ DO64(drr_object.drr_toguid);
break;
case DRR_FREEOBJECTS:
DO64(drr_freeobjects.drr_firstobj);
DO64(drr_freeobjects.drr_numobjs);
+ DO64(drr_freeobjects.drr_toguid);
break;
case DRR_WRITE:
DO64(drr_write.drr_object);
DO32(drr_write.drr_type);
DO64(drr_write.drr_offset);
DO64(drr_write.drr_length);
+ DO64(drr_write.drr_toguid);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
+ DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+ DO64(drr_write.drr_key.ddk_prop);
+ break;
+ case DRR_WRITE_BYREF:
+ DO64(drr_write_byref.drr_object);
+ DO64(drr_write_byref.drr_offset);
+ DO64(drr_write_byref.drr_length);
+ DO64(drr_write_byref.drr_toguid);
+ DO64(drr_write_byref.drr_refguid);
+ DO64(drr_write_byref.drr_refobject);
+ DO64(drr_write_byref.drr_refoffset);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
+ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+ DO64(drr_write_byref.drr_key.ddk_prop);
break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
DO64(drr_free.drr_length);
+ DO64(drr_free.drr_toguid);
+ break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
break;
case DRR_END:
DO64(drr_end.drr_checksum.zc_word[0]);
DO64(drr_end.drr_checksum.zc_word[1]);
DO64(drr_end.drr_checksum.zc_word[2]);
DO64(drr_end.drr_checksum.zc_word[3]);
+ DO64(drr_end.drr_toguid);
break;
}
#undef DO64
@@ -825,7 +1069,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
if (drro->drr_type == DMU_OT_NONE ||
drro->drr_type >= DMU_OT_NUMTYPES ||
drro->drr_bonustype >= DMU_OT_NUMTYPES ||
- drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
@@ -864,8 +1108,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen);
}
- if (err)
+ if (err) {
return (EINVAL);
+ }
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, drro->drr_object);
@@ -875,7 +1120,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
return (err);
}
- dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+ dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
+ tx);
dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
if (data != NULL) {
@@ -957,6 +1203,114 @@ restore_write(struct restorearg *ra, objset_t *os,
return (0);
}
+/*
+ * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream. This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+restore_write_byref(struct restorearg *ra, objset_t *os,
+ struct drr_write_byref *drrwbr)
+{
+ dmu_tx_t *tx;
+ int err;
+ guid_map_entry_t gmesrch;
+ guid_map_entry_t *gmep;
+ avl_index_t where;
+ objset_t *ref_os = NULL;
+ dmu_buf_t *dbp;
+
+ if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+ return (EINVAL);
+
+ /*
+ * If the GUID of the referenced dataset is different from the
+ * GUID of the target dataset, find the referenced dataset.
+ */
+ if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+ gmesrch.guid = drrwbr->drr_refguid;
+ if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+ &where)) == NULL) {
+ return (EINVAL);
+ }
+ if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+ return (EINVAL);
+ } else {
+ ref_os = os;
+ }
+
+ if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+ return (err);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_write(os, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+ dmu_buf_rele(dbp, FTAG);
+ dmu_tx_commit(tx);
+ return (0);
+}
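Following the comment above restore_write_byref(): a DRR_WRITE_BYREF record carries no data, only (refguid, refobject, refoffset), so the receiver copies the block from data it has already written. The user-space sketch below shows that resolution step; the byref_t fields mirror the drr_write_byref fields, but read_block()/write_block() and the names are illustrative assumptions, not the DMU calls used above.

#include <stdint.h>

typedef struct byref {
	uint64_t object, offset, length;	/* where the copy goes */
	uint64_t refguid;			/* which dataset holds the data */
	uint64_t refobject, refoffset;		/* where the data already is */
} byref_t;

typedef int (*read_block_t)(uint64_t guid, uint64_t obj, uint64_t off,
    void *buf, uint64_t len);
typedef int (*write_block_t)(uint64_t obj, uint64_t off,
    const void *buf, uint64_t len);

static int
apply_byref(const byref_t *br, read_block_t read_block,
    write_block_t write_block, void *scratch)
{
	/* Overflow check, as in restore_write_byref(). */
	if (br->offset + br->length < br->offset)
		return (-1);
	/* Fetch the previously received copy ... */
	if (read_block(br->refguid, br->refobject, br->refoffset,
	    scratch, br->length) != 0)
		return (-1);
	/* ... and write it at the target location. */
	return (write_block(br->object, br->offset, scratch, br->length));
}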
+
+static int
+restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+{
+ dmu_tx_t *tx;
+ void *data;
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > SPA_MAXBLOCKSIZE)
+ return (EINVAL);
+
+ data = restore_read(ra, drrs->drr_length);
+ if (data == NULL)
+ return (ra->err);
+
+ if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_buf_will_dirty(db_spill, tx);
+
+ if (db_spill->db_size < drrs->drr_length)
+ VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ bcopy(data, db_spill->db_data, drrs->drr_length);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
@@ -976,37 +1330,18 @@ restore_free(struct restorearg *ra, objset_t *os,
return (err);
}
-void
-dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
-{
- if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
- /*
- * online incremental or new fs: destroy the fs (which
- * may be a clone) that we created
- */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
- if (drc->drc_real_ds != drc->drc_logical_ds)
- dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
- } else {
- /*
- * offline incremental: rollback to most recent snapshot.
- */
- (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
- dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
- }
-}
-
/*
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
-dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep)
{
- kthread_t *td = curthread;
struct restorearg ra = { 0 };
dmu_replay_record_t *drr;
objset_t *os;
zio_cksum_t pcksum;
+ int featureflags;
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
ra.byteswap = TRUE;
@@ -1031,30 +1366,69 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
if (ra.byteswap) {
struct drr_begin *drrb = drc->drc_drrb;
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
drrb->drr_type = BSWAP_32(drrb->drr_type);
drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
}
- ra.td = td;
+ ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
ra.bufsize = 1<<20;
ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
- ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
+ ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+ DMU_SUBSTREAM);
ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
/*
* Open the objset we are modifying.
*/
- VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
+ VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+ featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+ /* if this stream is dedup'ed, set up the avl tree for guid mapping */
+ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+ minor_t minor;
+
+ if (cleanup_fd == -1) {
+ ra.err = EBADF;
+ goto out;
+ }
+ ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (ra.err) {
+ cleanup_fd = -1;
+ goto out;
+ }
+
+ if (*action_handlep == 0) {
+ ra.guid_to_ds_map =
+ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(ra.guid_to_ds_map, guid_compare,
+ sizeof (guid_map_entry_t),
+ offsetof(guid_map_entry_t, avlnode));
+ (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
+ (void *)ra.guid_to_ds_map,
+ DS_FIND_CHILDREN);
+ ra.err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, ra.guid_to_ds_map,
+ action_handlep);
+ if (ra.err)
+ goto out;
+ } else {
+ ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&ra.guid_to_ds_map);
+ if (ra.err)
+ goto out;
+ }
+ }
+
/*
* Read records and process them.
*/
@@ -1094,6 +1468,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ra.err = restore_write(&ra, os, &drrw);
break;
}
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref drrwbr =
+ drr->drr_u.drr_write_byref;
+ ra.err = restore_write_byref(&ra, os, &drrwbr);
+ break;
+ }
case DRR_FREE:
{
struct drr_free drrf = drr->drr_u.drr_free;
@@ -1112,6 +1493,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ra.err = ECKSUM;
goto out;
}
+ case DRR_SPILL:
+ {
+ struct drr_spill drrs = drr->drr_u.drr_spill;
+ ra.err = restore_spill(&ra, os, &drrs);
+ break;
+ }
default:
ra.err = EINVAL;
goto out;
@@ -1121,15 +1508,22 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
ASSERT(ra.err != 0);
out:
- dmu_objset_close(os);
+ if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+ zfs_onexit_fd_rele(cleanup_fd);
if (ra.err != 0) {
/*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
+ * destroy what we created, so we don't leave it in the
+ * inconsistent restoring state.
*/
txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
- dmu_recv_abort_cleanup(drc);
+
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ if (drc->drc_real_ds != drc->drc_logical_ds) {
+ mutex_exit(&drc->drc_logical_ds->ds_recvlock);
+ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+ }
}
kmem_free(ra.buf, ra.bufsize);
@@ -1153,12 +1547,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct recvendsyncarg *resa = arg2;
- dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+ dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
/* set snapshot's creation time and guid */
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
@@ -1170,35 +1564,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
struct recvendsyncarg resa;
dsl_dataset_t *ds = drc->drc_logical_ds;
int err;
/*
- * XXX hack; seems the ds is still dirty and
- * dsl_pool_zil_clean() expects it to have a ds_user_ptr
- * (and zil), but clone_swap() can close it.
+ * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+ * expects it to have a ds_user_ptr (and zil), but clone_swap()
+ * can close it.
*/
txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (ds != drc->drc_real_ds) {
- /* we are doing an online recv */
- if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
- err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
- drc->drc_force);
- if (err)
- dsl_dataset_disown(ds, dmu_recv_tag);
- } else {
- err = EBUSY;
- dsl_dataset_rele(ds, dmu_recv_tag);
- }
- /* dsl_dataset_destroy() will disown the ds */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+ err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+ drc->drc_force);
if (err)
- return (err);
+ goto out;
+ } else {
+ mutex_exit(&ds->ds_recvlock);
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ return (EBUSY);
}
resa.creation_time = drc->drc_drrb->drr_creation_time;
@@ -1208,16 +1598,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc)
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
recv_end_check, recv_end_sync, ds, &resa, 3);
if (err) {
- if (drc->drc_newfs) {
- ASSERT(ds == drc->drc_real_ds);
- (void) dsl_dataset_destroy(ds, dmu_recv_tag);
- return (err);
- } else {
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
- }
+ /* swap back */
+ (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
}
- /* release the hold from dmu_recv_begin */
+out:
+ mutex_exit(&ds->ds_recvlock);
dsl_dataset_disown(ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
return (err);
}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+ struct recvendsyncarg resa;
+ dsl_dataset_t *ds = drc->drc_logical_ds;
+ int err;
+
+ /*
+ * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+ * expects it to have a ds_user_ptr (and zil), but clone_swap()
+ * can close it.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+ resa.creation_time = drc->drc_drrb->drr_creation_time;
+ resa.toguid = drc->drc_drrb->drr_toguid;
+ resa.tosnap = drc->drc_tosnap;
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_end_check, recv_end_sync, ds, &resa, 3);
+ if (err) {
+ /* clean up the fs we just recv'd into */
+ (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
+ } else {
+ /* release the hold from dmu_recv_begin */
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ }
+ return (err);
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+ if (drc->drc_logical_ds != drc->drc_real_ds)
+ return (dmu_recv_existing_end(drc));
+ else
+ return (dmu_recv_new_end(drc));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 89cbfad29f84..023f90e12e34 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -33,17 +32,13 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/callb.h>
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
+int zfs_pd_blks_max = 100;
-struct prefetch_data {
+typedef struct prefetch_data {
kmutex_t pd_mtx;
kcondvar_t pd_cv;
int pd_blks_max;
@@ -51,47 +46,46 @@ struct prefetch_data {
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
-};
+} prefetch_data_t;
-struct traverse_data {
+typedef struct traverse_data {
spa_t *td_spa;
uint64_t td_objset;
blkptr_t *td_rootbp;
uint64_t td_min_txg;
int td_flags;
- struct prefetch_data *td_pfd;
+ prefetch_data_t *td_pfd;
blkptr_cb_t *td_func;
void *td_arg;
-};
+} traverse_data_t;
-static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object);
-/* ARGSUSED */
-static void
+static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
- return;
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
- zb.zb_objset = td->td_objset;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
+
+ return (0);
}
-/* ARGSUSED */
-static void
+static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
@@ -99,28 +93,29 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
- zb.zb_objset = td->td_objset;
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
+ td->td_arg);
}
+ return (0);
}
static void
-traverse_zil(struct traverse_data *td, zil_header_t *zh)
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
zilog_t *zilog;
/*
* We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
+ * replayed; plus, in read-only mode, blocks that are already stable.
*/
if (claim_txg == 0 && spa_writeable(td->td_spa))
return;
@@ -134,16 +129,18 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
}
static int
-traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
- int err = 0;
+ int err = 0, lasterr = 0;
arc_buf_t *buf = NULL;
- struct prefetch_data *pd = td->td_pfd;
+ prefetch_data_t *pd = td->td_pfd;
+ boolean_t hard = td->td_flags & TRAVERSE_HARD;
if (bp->blk_birth == 0) {
- err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+ td->td_arg);
return (err);
}
@@ -163,7 +160,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
}
if (td->td_flags & TRAVERSE_PRE) {
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
if (err)
return (err);
}
@@ -174,7 +174,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -187,15 +187,18 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, buf, cbp, &czb);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -203,33 +206,43 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
/* recursively visitbp() blocks below this */
dnp = buf->b_data;
- for (i = 0; i < epb && err == 0; i++, dnp++) {
+ for (i = 0; i < epb; i++, dnp++) {
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
zb->zb_blkid * epb + i);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
objset_phys_t *osp;
dnode_phys_t *dnp;
- err = arc_read_nolock(NULL, td->td_spa, bp,
+ err = dsl_read_nolock(NULL, td->td_spa, bp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
osp = buf->b_data;
- traverse_zil(td, &osp->os_zil_header);
-
dnp = &osp->os_meta_dnode;
- err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ if (err && hard) {
+ lasterr = err;
+ err = 0;
+ }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
DMU_USERUSED_OBJECT);
}
+ if (err && hard) {
+ lasterr = err;
+ err = 0;
+ }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@@ -240,35 +253,54 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
- if (err == 0 && (td->td_flags & TRAVERSE_POST))
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
+ }
- return (err);
+ return (err != 0 ? err : lasterr);
}
static int
-traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object)
{
- int j, err = 0;
+ int j, err = 0, lasterr = 0;
zbookmark_t czb;
+ boolean_t hard = (td->td_flags & TRAVERSE_HARD);
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, buf,
(blkptr_t *)&dnp->dn_blkptr[j], &czb);
- if (err)
- break;
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
}
- return (err);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset,
+ object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_spill, &czb);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ }
+ }
+ return (err != 0 ? err : lasterr);
}
/* ARGSUSED */
static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+ void *arg)
{
- struct prefetch_data *pfd = arg;
+ prefetch_data_t *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
ASSERT(pfd->pd_blks_fetched >= 0);
@@ -276,7 +308,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
return (EINTR);
if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
return (0);
mutex_enter(&pfd->pd_mtx);
@@ -286,7 +319,7 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
- (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+ (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, zb);
@@ -297,15 +330,16 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
static void
traverse_prefetch_thread(void *arg)
{
- struct traverse_data *td_main = arg;
- struct traverse_data td = *td_main;
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
zbookmark_t czb;
td.td_func = traverse_prefetcher;
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
- SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
mutex_enter(&td_main->td_pfd->pd_mtx);
@@ -319,16 +353,16 @@ traverse_prefetch_thread(void *arg)
* in syncing context).
*/
static int
-traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
- struct traverse_data td;
- struct prefetch_data pd = { 0 };
+ traverse_data_t td;
+ prefetch_data_t pd = { 0 };
zbookmark_t czb;
int err;
td.td_spa = spa;
- td.td_objset = objset;
+ td.td_objset = ds ? ds->ds_object : 0;
td.td_rootbp = rootbp;
td.td_min_txg = txg_start;
td.td_func = func;
@@ -336,17 +370,29 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
td.td_pfd = &pd;
td.td_flags = flags;
- pd.pd_blks_max = 100;
+ pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
+ objset_t *os;
+
+ err = dmu_objset_from_ds(ds, &os);
+ if (err)
+ return (err);
+
+ traverse_zil(&td, &os->os_zil_header);
+ }
+
if (!(flags & TRAVERSE_PREFETCH) ||
0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
&td, TQ_NOQUEUE))
pd.pd_exited = B_TRUE;
- SET_BOOKMARK(&czb, objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
mutex_enter(&pd.pd_mtx);
@@ -370,7 +416,7 @@ int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
- return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
&ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}
@@ -378,43 +424,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
* NB: pool must not be changing on-disk (eg, from zdb or sync context).
*/
int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
{
- int err;
+ int err, lasterr = 0;
uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
+ boolean_t hard = (flags & TRAVERSE_HARD);
/* visit the MOS */
- err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
- 0, TRAVERSE_PRE, func, arg);
+ err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
+ txg_start, flags, func, arg);
if (err)
return (err);
/* visit each dataset */
- for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+ for (obj = 1; err == 0 || (err != ESRCH && hard);
+ err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
- if (err)
- return (err);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ continue;
+ }
if (doi.doi_type == DMU_OT_DSL_DATASET) {
dsl_dataset_t *ds;
+ uint64_t txg = txg_start;
+
rw_enter(&dp->dp_config_rwlock, RW_READER);
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
rw_exit(&dp->dp_config_rwlock);
- if (err)
- return (err);
- err = traverse_dataset(ds,
- ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
- func, arg);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ continue;
+ }
+ if (ds->ds_phys->ds_prev_snap_txg > txg)
+ txg = ds->ds_phys->ds_prev_snap_txg;
+ err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
+ }
}
}
if (err == ESRCH)
err = 0;
- return (err);
+ return (err != 0 ? err : lasterr);
}
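The err/lasterr pattern that traverse_pool(), traverse_visitbp() and traverse_dnode() now share is worth seeing on its own: with TRAVERSE_HARD set, an error from one child no longer stops the walk; the last error seen is remembered and returned only after every child has been visited. A self-contained sketch of that policy, with visit_child() and nchildren as illustrative stand-ins:

static int
visit_all(int hard, int nchildren, int (*visit_child)(int))
{
	int err = 0, lasterr = 0;

	for (int i = 0; i < nchildren; i++) {
		err = visit_child(i);
		if (err) {
			if (!hard)
				break;		/* default: fail fast */
			lasterr = err;		/* hard: remember, keep going */
		}
	}
	return (err != 0 ? err : lasterr);
}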
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index b6a5cdbb89cd..81b8436707ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -33,7 +32,10 @@
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
+#include <sys/varargs.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
@@ -48,6 +50,8 @@ dmu_tx_create_dd(dsl_dir_t *dd)
tx->tx_pool = dd->dd_pool;
list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
offsetof(dmu_tx_hold_t, txh_node));
+ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
#ifdef ZFS_DEBUG
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
@@ -58,9 +62,9 @@ dmu_tx_create_dd(dsl_dir_t *dd)
dmu_tx_t *
dmu_tx_create(objset_t *os)
{
- dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
tx->tx_objset = os;
- tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
return (tx);
}
@@ -98,7 +102,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
int err;
if (object != DMU_NEW_OBJECT) {
- err = dnode_hold(os->os, object, tx, &dn);
+ err = dnode_hold(os, object, tx, &dn);
if (err) {
tx->tx_err = err;
return (NULL);
@@ -161,38 +165,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
}
static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
- boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+ int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
{
- int i = db->db_level + 1;
- dnode_t *dn = db->db_dnode;
-
- if (i >= dn->dn_nlevels)
+ objset_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent = NULL;
+ blkptr_t *bp = NULL;
+ uint64_t space;
+
+ if (level >= dn->dn_nlevels || history[level] == blkid)
return;
- db = db->db_parent;
- if (db == NULL) {
- uint64_t lvls = dn->dn_nlevels - i;
+ history[level] = blkid;
- txh->txh_space_towrite += lvls << dn->dn_indblkshift;
- return;
+ space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
+
+ if (db == NULL || db == dn->dn_dbuf) {
+ ASSERT(level != 0);
+ db = NULL;
+ } else {
+ ASSERT(DB_DNODE(db) == dn);
+ ASSERT(db->db_level == level);
+ ASSERT(db->db.db_size == space);
+ ASSERT(db->db_blkid == blkid);
+ bp = db->db_blkptr;
+ parent = db->db_parent;
}
- if (db != history[i]) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- uint64_t space = 1ULL << dn->dn_indblkshift;
+ freeable = (bp && (freeable ||
+ dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
- freeable = (db->db_blkptr && (freeable ||
- dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
- if (freeable)
- txh->txh_space_tooverwrite += space;
- else
- txh->txh_space_towrite += space;
- if (db->db_blkptr)
- txh->txh_space_tounref += space;
- history[i] = db;
- dmu_tx_count_indirects(txh, db, freeable, history);
- }
+ if (freeable)
+ txh->txh_space_tooverwrite += space;
+ else
+ txh->txh_space_towrite += space;
+ if (bp)
+ txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+ dmu_tx_count_twig(txh, dn, parent, level + 1,
+ blkid >> epbs, freeable, history);
}
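A minimal sketch of the history[] bookkeeping dmu_tx_count_twig() introduces: walking up from a level-0 block, the parent indirect at level L has block id blkid >> (L * epbs); once a level's block id has been charged it is remembered in history[L] (the caller initializes the array to -1ULL), so the next level-0 block under the same parents is not charged twice. The names and parameters below are illustrative, not the DMU's.

#include <stdint.h>

static uint64_t
charge_write_path(uint64_t blkid, int nlevels, int epbs,
    uint64_t datablksz, uint64_t indblksz, uint64_t *history)
{
	uint64_t charged = 0;

	for (int level = 0; level < nlevels; level++) {
		if (history[level] == blkid)
			break;		/* this subtree was already charged */
		history[level] = blkid;
		charged += (level == 0) ? datablksz : indblksz;
		blkid >>= epbs;		/* block id of the parent indirect */
	}
	return (charged);
}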
/* ARGSUSED */
@@ -213,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
max_ibs = DN_MAX_INDBLKSHIFT;
if (dn) {
- dmu_buf_impl_t *last[DN_MAX_LEVELS];
+ uint64_t history[DN_MAX_LEVELS];
int nlvls = dn->dn_nlevels;
int delta;
@@ -221,7 +234,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* For i/o error checking, read the first and last level-0
* blocks (if they are not aligned), and all the level-1 blocks.
*/
-
if (dn->dn_maxblkid == 0) {
delta = dn->dn_datablksz;
start = (off < dn->dn_datablksz) ? 0 : 1;
@@ -247,7 +259,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
/* last level-0 block */
end = (off+len-1) >> dn->dn_datablkshift;
- if (end != start &&
+ if (end != start && end <= dn->dn_maxblkid &&
P2PHASE(off+len, dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(zio, dn, 0, end);
if (err)
@@ -290,29 +302,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* If this write is not off the end of the file
* we need to account for overwrites/unref.
*/
- if (start <= dn->dn_maxblkid)
- bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ if (start <= dn->dn_maxblkid) {
+ for (int l = 0; l < DN_MAX_LEVELS; l++)
+ history[l] = -1ULL;
+ }
while (start <= dn->dn_maxblkid) {
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, 0, start, FTAG);
+ err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
- if (db->db_blkptr && dsl_dataset_block_freeable(ds,
- db->db_blkptr->blk_birth)) {
- dprintf_bp(db->db_blkptr, "can free old%s", "");
- txh->txh_space_tooverwrite += dn->dn_datablksz;
- txh->txh_space_tounref += dn->dn_datablksz;
- dmu_tx_count_indirects(txh, db, TRUE, last);
- } else {
- txh->txh_space_towrite += dn->dn_datablksz;
- if (db->db_blkptr)
- txh->txh_space_tounref +=
- bp_get_dasize(spa, db->db_blkptr);
- dmu_tx_count_indirects(txh, db, FALSE, last);
+
+ if (err) {
+ txh->txh_tx->tx_err = err;
+ return;
}
+
+ dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+ history);
dbuf_rele(db, FTAG);
if (++start > end) {
/*
@@ -377,13 +384,13 @@ static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+ dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
uint64_t space = mdn->dn_datablksz +
((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
if (dn && dn->dn_dbuf->db_blkptr &&
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth)) {
+ dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
txh->txh_space_tooverwrite += space;
txh->txh_space_tounref += space;
} else {
@@ -428,7 +435,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
* The struct_rwlock protects us against dn_nlevels
* changing, in case (against all odds) we manage to dirty &
* sync out the changes after we check for being dirty.
- * Also, dbuf_hold_level() wants us to have the struct_rwlock.
+ * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -458,9 +465,9 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkptr_t *bp = dn->dn_phys->dn_blkptr;
ASSERT3U(blkid + i, <, dn->dn_nblkptr);
bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+ if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
- space += bp_get_dasize(spa, bp);
+ space += bp_get_dsize(spa, bp);
}
unref += BP_GET_ASIZE(bp);
}
@@ -516,14 +523,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks);
- dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
-
- txh->txh_memory_tohold += dbuf->db.db_size;
- if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
- txh->txh_tx->tx_err = E2BIG;
- dbuf_rele(dbuf, FTAG);
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+ if (err) {
+ txh->txh_tx->tx_err = err;
break;
}
+
+ txh->txh_memory_tohold += dbuf->db.db_size;
+
+ /*
+ * We don't check memory_tohold against DMU_MAX_ACCESS because
+	 * memory_tohold is an over-estimation (especially of the >L1
+	 * indirect blocks), so the check could fail spuriously.  Callers
+	 * should have already verified that they will not be holding
+	 * too much memory.
+ */
+
err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
if (err != 0) {
txh->txh_tx->tx_err = err;
@@ -535,9 +550,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
bp += blkoff;
for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+ if (dsl_dataset_block_freeable(ds, &bp[i],
+ bp[i].blk_birth)) {
dprintf_bp(&bp[i], "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
+ space += bp_get_dsize(spa, &bp[i]);
}
unref += BP_GET_ASIZE(bp);
}
@@ -582,6 +598,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
if (len != DMU_OBJECT_END)
dmu_tx_count_write(txh, off+len, 1);
+ dmu_tx_count_dnode(txh);
+
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
@@ -624,7 +642,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
}
}
- dmu_tx_count_dnode(txh);
dmu_tx_count_free(txh, off, len);
}
@@ -674,6 +691,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* the size will change between now and the dbuf dirty call.
*/
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ &dn->dn_phys->dn_blkptr[0],
dn->dn_phys->dn_blkptr[0].blk_birth)) {
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
} else {
@@ -689,7 +707,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* access the name in this fat-zap so that we'll check
* for i/o errors to the leaf blocks, etc.
*/
- err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ err = zap_lookup(dn->dn_objset, dn->dn_object, name,
8, 0, NULL);
if (err == EIO) {
tx->tx_err = err;
@@ -697,7 +715,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
}
}
- err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+ err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*
@@ -769,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
dmu_tx_hold_t *txh;
int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(tx->tx_txg != 0);
- ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
ASSERT3U(dn->dn_object, ==, db->db.db_object);
- if (tx->tx_anyobj)
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
return;
+ }
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
return;
+ }
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
@@ -809,10 +833,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
match_offset = TRUE;
/*
* We will let this hold work for the bonus
- * buffer so that we don't need to hold it
- * when creating a new object.
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
*/
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
match_offset = TRUE;
/*
* They might have to increase nlevels,
@@ -833,8 +858,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
case THT_BONUS:
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
match_offset = TRUE;
break;
case THT_ZAP:
@@ -847,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
ASSERT(!"bad txh_type");
}
}
- if (match_object && match_offset)
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
return;
+ }
}
+ DB_DNODE_EXIT(db);
panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
@@ -932,7 +964,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
* assume that we won't be able to free or overwrite anything.
*/
if (tx->tx_objset &&
- dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
tx->tx_lastsnap_txg) {
towrite += tooverwrite;
tooverwrite = tofree = 0;
@@ -1113,8 +1145,13 @@ dmu_tx_commit(dmu_tx_t *tx)
if (tx->tx_tempreserve_cookie)
dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+ if (!list_is_empty(&tx->tx_callbacks))
+ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
+
+ list_destroy(&tx->tx_callbacks);
list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
@@ -1143,6 +1180,14 @@ dmu_tx_abort(dmu_tx_t *tx)
if (dn != NULL)
dnode_rele(dn, tx);
}
+
+ /*
+ * Call any registered callbacks with an error code.
+ */
+ if (!list_is_empty(&tx->tx_callbacks))
+ dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+ list_destroy(&tx->tx_callbacks);
list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
refcount_destroy_many(&tx->tx_space_written,
@@ -1159,3 +1204,179 @@ dmu_tx_get_txg(dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
return (tx->tx_txg);
}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+ dmu_tx_callback_t *dcb;
+
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+ dcb->dcb_func = func;
+ dcb->dcb_data = data;
+
+ list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+ dmu_tx_callback_t *dcb;
+
+ while (dcb = list_head(cb_list)) {
+ list_remove(cb_list, dcb);
+ dcb->dcb_func(dcb->dcb_data, error);
+ kmem_free(dcb, sizeof (dmu_tx_callback_t));
+ }
+}
+
+/*
+ * Interface to hold a bunch of attributes.
+ * Used for creating new files.
+ * attrsize is the total size of all attributes
+ * to be added during object creation.
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * Hold the necessary attribute name for attribute registration.
+ * It should be a very rare case where this is needed.  If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ int i;
+
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_hold_t *txh;
+ blkptr_t *bp;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+
+ dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ return;
+
+ /* If blkptr doesn't exist then add space to towrite */
+ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref = 0;
+ } else {
+ bp = &dn->dn_phys->dn_spill;
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ bp, bp->blk_birth))
+ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ if (bp->blk_birth)
+ txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ }
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attribute
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates that the attribute being written may grow in
+ * size, which requires holding the spill block and, when a layout
+ * ZAP object already exists, that ZAP as well.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_tx_hold_bonus(tx, object);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ } else {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
+ }
+}
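
The commit-callback interface added above (dmu_tx_callback_register() / dmu_tx_do_callbacks()) gives consumers a way to learn the fate of a transaction: callbacks registered on a tx are handed to txg_register_callbacks() at dmu_tx_commit() time (so they presumably fire with a zero error once that txg has synced, via a path outside this diff), and dmu_tx_abort() invokes them immediately with ECANCELED. Below is a minimal, illustrative caller; the callback, its argument struct, and the surrounding context are hypothetical and not part of the patch.

/*
 * Hypothetical consumer of the commit-callback interface.  error is 0
 * once the txg carrying this tx has synced, or ECANCELED if the tx
 * was aborted.
 */
typedef struct my_cb_arg {
	uint64_t	mca_object;		/* example payload */
} my_cb_arg_t;

static void
my_commit_cb(void *data, int error)
{
	my_cb_arg_t *mca = data;

	if (error != 0)
		dprintf("obj %llu: tx did not commit (%d)\n",
		    (u_longlong_t)mca->mca_object, error);
	kmem_free(mca, sizeof (my_cb_arg_t));
}

	/* ... after dmu_tx_assign() has succeeded ... */
	my_cb_arg_t *mca = kmem_alloc(sizeof (my_cb_arg_t), KM_SLEEP);
	mca->mca_object = object;
	dmu_tx_callback_register(tx, my_commit_cb, mca);
	/* issue the writes, then dmu_tx_commit(tx) as usual */
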
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index acf628453571..b5ca66628f80 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -244,7 +244,7 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
break;
}
zs->zst_ph_offset = prefetch_tail;
- zs->zst_last = LBOLT;
+ zs->zst_last = ddi_get_lbolt();
}
void
@@ -405,6 +405,7 @@ top:
rc = 1;
goto out;
}
+
if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -432,6 +433,7 @@ top:
rc = 1;
goto out;
}
+
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -462,6 +464,7 @@ top:
rc = 1;
goto out;
}
+
if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
@@ -481,6 +484,7 @@ top:
rc = 1;
goto out;
}
+
if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
@@ -603,7 +607,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf)
for (zs = list_head(&zf->zf_stream); zs;
zs = list_next(&zf->zf_stream, zs)) {
- if (((LBOLT - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
break;
}
@@ -734,7 +738,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
newstream->zst_cap = zst.zst_len;
newstream->zst_direction = ZFETCH_FORWARD;
- newstream->zst_last = LBOLT;
+ newstream->zst_last = ddi_get_lbolt();
mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index f9661d62d93e..b43035bb2e23 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -39,19 +38,35 @@
static int free_range_compar(const void *node1, const void *node2);
static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define DNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define DNODE_STAT_ADD(stat) /* nothing */
+#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+#ifdef sun
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
- int i;
dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
+ int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -60,8 +75,18 @@ dnode_cons(void *arg, void *unused, int kmflag)
refcount_create(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
+ list_link_init(&dn->dn_dirty_link[i]);
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
@@ -70,9 +95,28 @@ dnode_cons(void *arg, void *unused, int kmflag)
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
+ dn->dn_moved = 0;
+ POINTER_INVALIDATE(&dn->dn_objset);
return (0);
}
@@ -89,27 +133,56 @@ dnode_dest(void *arg, void *unused)
cv_destroy(&dn->dn_notxholds);
refcount_destroy(&dn->dn_holds);
refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
avl_destroy(&dn->dn_ranges[i]);
list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
}
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_free_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT3U(dn->dn_dirtyctx, ==, 0);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT3U(dn->dn_oldused, ==, 0);
+ ASSERT3U(dn->dn_oldflags, ==, 0);
+ ASSERT3U(dn->dn_olduid, ==, 0);
+ ASSERT3U(dn->dn_oldgid, ==, 0);
+ ASSERT3U(dn->dn_newuid, ==, 0);
+ ASSERT3U(dn->dn_newgid, ==, 0);
+ ASSERT3U(dn->dn_id_flags, ==, 0);
+
+ ASSERT3U(dn->dn_dbufs_count, ==, 0);
list_destroy(&dn->dn_dbufs);
}
void
dnode_init(void)
{
+ ASSERT(dnode_cache == NULL);
dnode_cache = kmem_cache_create("dnode_t",
sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move);
}
void
dnode_fini(void)
{
kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
}
@@ -121,6 +194,7 @@ dnode_verify(dnode_t *dn)
ASSERT(dn->dn_phys);
ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
@@ -210,6 +284,11 @@ dnode_byteswap(dnode_phys_t *dnp)
ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
}
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
}
void
@@ -258,6 +337,27 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ dnode_setdirty(dn, tx);
+ dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -272,18 +372,30 @@ dnode_setdblksz(dnode_t *dn, int size)
}
static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
- dn->dn_objset = os;
+ ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
dn->dn_object = object;
dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
dn->dn_phys = dnp;
- if (dnp->dn_datablkszsec)
+ if (dnp->dn_datablkszsec) {
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
dn->dn_indblkshift = dnp->dn_indblkshift;
dn->dn_nlevels = dnp->dn_nlevels;
dn->dn_type = dnp->dn_type;
@@ -293,49 +405,71 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+ dn->dn_id_flags = 0;
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
mutex_enter(&os->os_lock);
list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning dn_objset makes the
+ * dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
static void
dnode_destroy(dnode_t *dn)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
- ASSERT(dn->dn_oldphys == NULL);
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
list_remove(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
- if (dn->dn_dirtyctx_firstset) {
+ /* the dnode can no longer move, so we can release the handle */
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ if (dn->dn_dirtyctx_firstset != NULL) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
+ if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
}
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
}
@@ -367,6 +501,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT(ot != DMU_OT_NONE);
ASSERT3U(ot, <, DMU_OT_NUMTYPES);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
@@ -379,9 +514,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -392,7 +530,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else
+ dn->dn_nblkptr = 1 +
+ ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -406,10 +548,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
}
dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -425,13 +569,16 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
+ (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
+ dn->dn_id_flags = 0;
+
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_setdirty(dn, tx);
if (dn->dn_datablksz != blocksize) {
@@ -444,9 +591,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
}
if (dn->dn_bonuslen != bonuslen)
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ nblkptr = 1;
+ else
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
rw_exit(&dn->dn_struct_rwlock);
/* change type */
@@ -472,9 +629,306 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
+#ifdef DNODE_STATS
+static struct {
+ uint64_t dms_dnode_invalid;
+ uint64_t dms_dnode_recheck1;
+ uint64_t dms_dnode_recheck2;
+ uint64_t dms_dnode_special;
+ uint64_t dms_dnode_handle;
+ uint64_t dms_dnode_rwlock;
+ uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+ refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(list_is_empty(&ndn->dn_dbufs));
+ list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+ ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+ ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ odn->dn_zfetch.zf_dnode = NULL;
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_ranges[i].avl_root = NULL;
+ odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef sun
+#ifdef _KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = odn->dn_dbufs_count;
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == ndn->dn_dbufs_count);
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+#endif /* sun */
+
void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
{
+ dnode_t *dn = dnh->dnh_dnode;
+
/*
* Wait for final references to the dnode to clear. This can
* only happen if the arc is asyncronously evicting state that
@@ -483,13 +937,19 @@ dnode_special_close(dnode_t *dn)
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+ dnh->dnh_dnode = dn;
+ zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
return (dn);
}
@@ -497,34 +957,43 @@ dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
static void
dnode_buf_pageout(dmu_buf_t *db, void *arg)
{
- dnode_t **children_dnodes = arg;
+ dnode_children_t *children_dnodes = arg;
int i;
int epb = db->db_size >> DNODE_SHIFT;
+ ASSERT(epb == children_dnodes->dnc_count);
+
for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
+ dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ dnode_t *dn;
- if (dn == NULL)
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (dnh->dnh_dnode == NULL) {
+ zrl_destroy(&dnh->dnh_zrlock);
continue;
-#ifdef ZFS_DEBUG
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligable for eviction and this function
+ * it wouldn't be eligible for eviction and this function
* would not have been called.
*/
ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
}
/*
@@ -534,7 +1003,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag,
void *tag, dnode_t **dnp)
{
int epb, idx, err;
@@ -543,17 +1012,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
+ dnode_children_t *children_dnodes;
+ dnode_handle_t *dnh;
/*
* If you are holding the spa config lock as writer, you shouldn't
- * be asking the DMU to do *anything*.
+ * be asking the DMU to do *anything* unless it's the root pool
+ * which may require us to read from the root filesystem while
+ * holding some (not all) of the locks as writer.
*/
- ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+ (spa_is_root(os->os_spa) &&
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
- os->os_userused_dnode : os->os_groupused_dnode;
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
return (ENOENT);
type = dn->dn_type;
@@ -570,7 +1044,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
- mdn = os->os_meta_dnode;
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
DNODE_VERIFY(mdn);
@@ -597,26 +1072,39 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
idx = object & (epb-1);
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
+ int i;
+ dnode_children_t *winner;
+ children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ children_dnodes->dnc_count = epb;
+ dnh = &children_dnodes->dnc_children[0];
+ for (i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+ dnh[i].dnh_dnode = NULL;
+ }
if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
+ ASSERT(children_dnodes->dnc_count == epb);
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ if ((dn = dnh->dnh_dnode) == NULL) {
+ dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, dnp, db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ dn = dnode_create(os, phys, db, object, dnh);
+ winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
if (winner != NULL) {
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
dn = winner;
}
}
@@ -626,15 +1114,18 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || dn->dn_oldphys))) {
+ (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
+ zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
+ dbuf_add_ref(db, dnh);
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
@@ -649,7 +1140,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
* Return held dnode if the object is allocated, NULL if not.
*/
int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
@@ -676,19 +1167,43 @@ void
dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ /* Get while the hold prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ dbuf_rele(db, dnh);
+ }
}
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
@@ -701,10 +1216,15 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
#ifdef ZFS_DEBUG
mutex_enter(&dn->dn_mtx);
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
mutex_exit(&dn->dn_mtx);
#endif
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
mutex_enter(&os->os_lock);
/*
@@ -719,6 +1239,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_datablksz != 0);
ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
@@ -734,7 +1255,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
- * dbuf maintaines a hold on the dnode. When the last child
+ * dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
@@ -813,7 +1334,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -857,7 +1379,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
int epbs, new_nlevels;
uint64_t sz;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(have_read ?
RW_READ_HELD(&dn->dn_struct_rwlock) :
@@ -904,6 +1426,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
@@ -914,7 +1437,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
@@ -1169,6 +1693,20 @@ out:
rw_exit(&dn->dn_struct_rwlock);
}
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
@@ -1177,7 +1715,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
return (FALSE);
/*
@@ -1190,6 +1728,9 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
if (dn->dn_free_txg)
return (TRUE);
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
@@ -1247,7 +1788,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
if (space > 0)
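
With dnodes now candidates for relocation by the kmem move callback, the old db->db_dnode back pointer is replaced by a handle, and every consumer brackets its dnode access with DB_DNODE_ENTER()/DB_DNODE_EXIT(), as the dmu_tx.c and dnode_sync.c hunks in this diff do. A small sketch of the idiom; the helper function itself is hypothetical:

/*
 * Sketch of the access idiom this change introduces: pin the handle,
 * use the dnode, release the handle.  While the handle is entered the
 * dnode cannot be moved out from under us.
 */
static uint64_t
example_db_datablksz(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	uint64_t blksz;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	blksz = dn->dn_datablksz;
	DB_DNODE_EXIT(db);

	return (blksz);
}
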
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 3bf0c81d0992..32afe7d74735 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
if (child == NULL)
continue;
- ASSERT3P(child->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
if (child->db_parent && child->db_parent != dn->dn_dbuf) {
ASSERT(child->db_parent->db_level == db->db_level);
ASSERT(child->db_blkptr !=
@@ -120,7 +123,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
bzero(bp, sizeof (blkptr_t));
blocks_freed += 1;
@@ -136,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
int off, num;
int i, err, epbs;
uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
off = start - (db->db_blkid * 1<<epbs);
num = end - start + 1;
ASSERT3U(off, >=, 0);
ASSERT3U(num, >=, 0);
ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
ASSERT(db->db_blkptr != NULL);
@@ -156,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
ASSERT(err == 0);
@@ -201,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
dbuf_rele(child, FTAG);
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -210,7 +217,7 @@ static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
@@ -228,10 +235,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
+ dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
shift = (db->db_level - 1) * epbs;
dbstart = db->db_blkid << epbs;
start = blkid >> shift;
@@ -254,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
@@ -273,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
}
dbuf_rele(subdb, FTAG);
}
+ DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -376,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
- ASSERT3P(db->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
@@ -424,6 +439,9 @@ dnode_undirty_dbufs(list_t *list)
dmu_buf_impl_t *db = dr->dr_dbuf;
uint64_t txg = dr->dr_txg;
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
mutex_enter(&db->db_mtx);
/* XXX - use dbuf_undirty()? */
list_remove(list, dr);
@@ -431,18 +449,15 @@ dnode_undirty_dbufs(list_t *list)
db->db_last_dirty = NULL;
db->db_dirtycnt -= 1;
if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_data == db->db_buf);
dbuf_unoverride(dr);
- mutex_exit(&db->db_mtx);
} else {
- mutex_exit(&db->db_mtx);
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}
}
@@ -493,6 +508,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -515,6 +531,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
static const dnode_phys_t zerodn = { 0 };
+ boolean_t kill_spill = B_FALSE;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -526,10 +543,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
if (dmu_objset_userused_enabled(dn->dn_objset) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- ASSERT(dn->dn_oldphys == NULL);
- dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
- *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
} else {
/* Once we account for it, we should always account for it. */
ASSERT(!(dn->dn_phys->dn_flags &
@@ -560,6 +579,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+ avl_last(&dn->dn_ranges[txgoff]) ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -576,6 +596,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_bonuslen[txgoff] = 0;
}
+ if (dn->dn_next_bonustype[txgoff]) {
+ ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ /*
+	 * We will remove the spill block when the file is being removed
+	 * or when we have been explicitly asked to remove it.
+ */
+ if (dn->dn_rm_spillblk[txgoff] ||
+ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
+ if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -592,6 +630,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
+ if (kill_spill) {
+ (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
/* process all the "freed" ranges in the file */
while (rp = avl_last(&dn->dn_ranges[txgoff])) {
dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
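
The new per-dnode arrays consumed above (dn_next_bonustype[], dn_rm_spillblk[]) use the same per-txg slot scheme as the existing ones: the low bits of the transaction group number select one of TXG_SIZE in-flight slots. A small worked illustration, assuming TXG_SIZE is 4 (so TXG_MASK is 3) as in OpenSolaris:

	/* txg 1027: 1027 & 3 == 3, so open context records into slot 3 ... */
	int txgoff = tx->tx_txg & TXG_MASK;
	dn->dn_rm_spillblk[txgoff] = DN_KILL_SPILLBLK;	/* dnode_rm_spill() */

	/* ... and dnode_sync() for that txg later sees DN_KILL_SPILLBLK,  */
	/* frees dn_phys->dn_spill and clears DNODE_FLAG_SPILL_BLKPTR.     */
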
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index ac9d67f671f6..19b663e3ec0a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -38,16 +37,24 @@
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
-#include <sys/sunddi.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
static char *dsl_reaper = "the grim reaper";
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
@@ -76,14 +83,14 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
}
void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
- dprintf_bp(bp, "born, ds=%p\n", ds);
+ dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
/* It could have been compressed away to nothing */
@@ -103,6 +110,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
return;
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
@@ -119,29 +127,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
}
int
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- ASSERT(pio != NULL);
- ASSERT(dmu_tx_is_syncing(tx));
- /* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
return (0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
ASSERT(used > 0);
if (ds == NULL) {
- int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-used, -compressed, -uncompressed, tx);
@@ -154,13 +159,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
- int err;
int64_t delta;
- dprintf_bp(bp, "freeing: %s", "");
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
@@ -176,7 +178,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
mutex_exit(&ds->ds_dir->dd_lock);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call dsl_deadlist_insert() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_append(&ds->ds_pending_deadlist, bp);
+ } else {
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+ }
ASSERT3U(ds->ds_prev->ds_object, ==,
ds->ds_phys->ds_prev_snap_obj);
ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
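The async path above is selected by the caller's context: a zio write-done callback (interrupt thread) must defer the deadlist insert, while syncing context may insert directly. A hedged caller-side sketch of the new dsl_dataset_block_kill() signature; the surrounding callers are illustrative, not part of this hunk:

	/* From a zio done callback (interrupt thread): defer the insert. */
	(void) dsl_dataset_block_kill(ds, bp, tx, B_TRUE);

	/* From syncing context: insert into ds_deadlist immediately. */
	(void) dsl_dataset_block_kill(ds, bp, tx, B_FALSE);

In the deferred case the bp sits on ds_pending_deadlist until dsl_pool_sync() drains it into the real deadlist.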
@@ -189,7 +202,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
ds->ds_prev->ds_phys->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
- if (bp->blk_birth > ds->ds_origin_txg) {
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -230,9 +243,15 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
}
boolean_t
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+ uint64_t blk_birth)
{
- return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+ if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
+ return (B_FALSE);
+
+ ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+
+ return (B_TRUE);
}
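Passing the blkptr_t itself lets the freeability check double as a dedup-table prefetch for blocks that really will be freed. A hypothetical caller-side check, assuming a valid blkptr_t *bp whose birth txg is bp->blk_birth:

	if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
		/* block will be freed or deadlisted; DDT prefetch issued */
	}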
/* ARGSUSED */
@@ -243,19 +262,23 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
- dprintf_ds(ds, "evicting %s\n", "");
-
unique_remove(ds->ds_fsid_guid);
- if (ds->ds_user_ptr != NULL)
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ if (ds->ds_objset != NULL)
+ dmu_objset_evict(ds->ds_objset);
if (ds->ds_prev) {
dsl_dataset_drop_ref(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (db != NULL) {
+ dsl_deadlist_close(&ds->ds_deadlist);
+ } else {
+ ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
+ ASSERT(!ds->ds_deadlist.dl_oldfmt);
+ }
if (ds->ds_dir)
dsl_dir_close(ds->ds_dir, ds);
@@ -264,12 +287,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
if (mutex_owned(&ds->ds_lock))
mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
if (mutex_owned(&ds->ds_opening_lock))
mutex_exit(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_opening_lock);
- if (mutex_owned(&ds->ds_deadlist.bpl_lock))
- mutex_exit(&ds->ds_deadlist.bpl_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
@@ -329,6 +350,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
matchtype_t mt;
int err;
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_FIRST;
else
@@ -348,6 +371,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
int err;
+ dmu_object_info_t doi;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
@@ -355,6 +379,12 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
if (err)
return (err);
+
+ /* Make sure dsobj has the correct object type. */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_type != DMU_OT_DSL_DATASET)
+ return (EINVAL);
+
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner;
@@ -365,28 +395,27 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
ds->ds_phys = dbuf->db_data;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
- NULL);
rw_init(&ds->ds_rwlock, 0, 0, 0);
cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
- err = bplist_open(&ds->ds_deadlist,
+ bplist_create(&ds->ds_pending_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
+
if (err == 0) {
err = dsl_dir_open_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
}
if (err) {
- /*
- * we don't really need to close the blist if we
- * just opened it.
- */
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
@@ -399,21 +428,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
ds->ds_phys->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
-
- if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
- dsl_dataset_t *origin;
-
- err = dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj,
- FTAG, &origin);
- if (err == 0) {
- ds->ds_origin_txg =
- origin->ds_phys->ds_creation_txg;
- dsl_dataset_rele(origin, FTAG);
- }
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj,
+ &ds->ds_userrefs);
}
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
}
if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
@@ -449,13 +472,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_evict);
}
if (err || winner) {
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
if (ds->ds_prev)
dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
@@ -551,17 +575,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
}
int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
- dsl_dataset_t **dsp)
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
-
- ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
-
+ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
if (err)
return (err);
- if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
- dsl_dataset_rele(*dsp, owner);
+ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+ dsl_dataset_rele(*dsp, tag);
*dsp = NULL;
return (EBUSY);
}
@@ -628,18 +649,14 @@ out:
}
int
-dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
+dsl_dataset_own(const char *name, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold(name, owner, dsp);
+ int err = dsl_dataset_hold(name, tag, dsp);
if (err)
return (err);
- if ((*dsp)->ds_phys->ds_num_children > 0 &&
- !DS_MODE_IS_READONLY(flags)) {
- dsl_dataset_rele(*dsp, owner);
- return (EROFS);
- }
- if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
- dsl_dataset_rele(*dsp, owner);
+ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+ dsl_dataset_rele(*dsp, tag);
return (EBUSY);
}
return (0);
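The DS_MODE_* flag word is replaced by a single 'inconsistentok' boolean plus an opaque tag, and disown now takes that same tag. A minimal sketch of the new pairing, with a placeholder dataset name and FTAG used as the tag:

	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_own("pool/fs@snap", B_FALSE, FTAG, &ds);
	if (err == 0) {
		/* ... exclusive use of ds ... */
		dsl_dataset_disown(ds, FTAG);
	}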
@@ -711,9 +728,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
}
void
-dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
+dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
- ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
+ ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
(DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
mutex_enter(&ds->ds_lock);
@@ -724,20 +741,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
}
mutex_exit(&ds->ds_lock);
if (ds->ds_dbuf)
- dsl_dataset_drop_ref(ds, owner);
+ dsl_dataset_drop_ref(ds, tag);
else
- dsl_dataset_evict(ds->ds_dbuf, ds);
+ dsl_dataset_evict(NULL, ds);
}
boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
+dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
boolean_t gotit = FALSE;
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL &&
(!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
- ds->ds_owner = owner;
+ ds->ds_owner = tag;
if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
rw_exit(&ds->ds_rwlock);
gotit = TRUE;
@@ -788,10 +805,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (origin) {
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds;
+
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
origin->ds_phys->ds_creation_txg;
@@ -807,6 +826,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
if (origin->ds_phys->ds_next_clones_obj == 0) {
origin->ds_phys->ds_next_clones_obj =
@@ -820,6 +845,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(mos,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+ }
}
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -852,6 +887,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dir_close(dd, FTAG);
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's zil header.
+ */
+ if (origin != NULL) {
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ dsl_dataset_dirty(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
return (dsobj);
}
@@ -859,30 +909,29 @@ struct destroyarg {
dsl_sync_task_group_t *dstg;
char *snapname;
char *failed;
+ boolean_t defer;
};
static int
-dsl_snapshot_destroy_one(char *name, void *arg)
+dsl_snapshot_destroy_one(const char *name, void *arg)
{
struct destroyarg *da = arg;
dsl_dataset_t *ds;
- char *cp;
int err;
+ char *dsname;
- (void) strcat(name, "@");
- (void) strcat(name, da->snapname);
- err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- da->dstg, &ds);
- cp = strchr(name, '@');
- *cp = '\0';
+ dsname = kmem_asprintf("%s@%s", name, da->snapname);
+ err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
+ strfree(dsname);
if (err == 0) {
+ struct dsl_ds_destroyarg *dsda;
+
dsl_dataset_make_exclusive(ds, da->dstg);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
+ dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
+ dsda->ds = ds;
+ dsda->defer = da->defer;
dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
+ dsl_dataset_destroy_sync, dsda, da->dstg, 0);
} else if (err == ENOENT) {
err = 0;
} else {
@@ -896,7 +945,7 @@ dsl_snapshot_destroy_one(char *name, void *arg)
*/
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
-dsl_snapshots_destroy(char *fsname, char *snapname)
+dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
{
int err;
struct destroyarg da;
@@ -909,6 +958,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
da.snapname = snapname;
da.failed = fsname;
+ da.defer = defer;
err = dmu_objset_find(fsname,
dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
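With the defer flag plumbed through struct destroyarg, a caller can request deferred destruction of a snapshot across all children of a filesystem. A hedged usage sketch; the names are placeholders and MAXNAMELEN-sized buffers stand in for the real arguments:

	char fsname[MAXNAMELEN] = "pool/fs";
	char snapname[MAXNAMELEN] = "snap";
	int err;

	/* Destroy (or defer-destroy) "snap" under every child of fsname. */
	err = dsl_snapshots_destroy(fsname, snapname, B_TRUE);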
@@ -918,7 +968,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
for (dst = list_head(&da.dstg->dstg_tasks); dst;
dst = list_next(&da.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
+ struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
+ dsl_dataset_t *ds = dsda->ds;
+
/*
* Return the file system name that triggered the error
*/
@@ -926,7 +978,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
dsl_dataset_name(ds, fsname);
*strchr(fsname, '@') = '\0';
}
+ ASSERT3P(dsda->rm_origin, ==, NULL);
dsl_dataset_disown(ds, da.dstg);
+ kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
}
dsl_sync_task_group_destroy(da.dstg);
@@ -934,34 +988,94 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
return (err);
}
+static boolean_t
+dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
+{
+ boolean_t might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds))
+ might_destroy = B_TRUE;
+ mutex_exit(&ds->ds_lock);
+
+ return (might_destroy);
+}
+
+/*
+ * If we're removing a clone, and these three conditions are true:
+ * 1) the clone's origin has no other children
+ * 2) the clone's origin has no user references
+ * 3) the clone's origin has been marked for deferred destruction
+ * Then, prepare to remove the origin as part of this sync task group.
+ */
+static int
+dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *origin = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(origin)) {
+ char *name;
+ int namelen;
+ int error;
+
+ namelen = dsl_dataset_namelen(origin) + 1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(origin, name);
+#ifdef _KERNEL
+ error = zfs_unmount_snap(name, NULL);
+ if (error) {
+ kmem_free(name, namelen);
+ return (error);
+ }
+#endif
+ error = dsl_dataset_own(name, B_TRUE, tag, &origin);
+ kmem_free(name, namelen);
+ if (error)
+ return (error);
+ dsda->rm_origin = origin;
+ dsl_dataset_make_exclusive(origin, tag);
+ }
+
+ return (0);
+}
+
/*
* ds must be opened as OWNER. On return (whether successful or not),
* ds will be closed and caller can no longer dereference it.
*/
int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
int err;
dsl_sync_task_group_t *dstg;
objset_t *os;
dsl_dir_t *dd;
uint64_t obj;
+ struct dsl_ds_destroyarg dsda = { 0 };
+ dsl_dataset_t dummy_ds = { 0 };
+
+ dsda.ds = ds;
if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
+ dsda.defer = defer;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, tag, 0);
+ &dsda, tag, 0);
+ ASSERT3P(dsda.rm_origin, ==, NULL);
+ goto out;
+ } else if (defer) {
+ err = EINVAL;
goto out;
}
dd = ds->ds_dir;
+ dummy_ds.ds_dir = dd;
+ dummy_ds.ds_object = ds->ds_object;
/*
* Check for errors and mark this ds as inconsistent, in
@@ -972,7 +1086,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
if (err)
goto out;
- err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ err = dmu_objset_from_ds(ds, &os);
if (err)
goto out;
@@ -988,11 +1102,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
*/
(void) dmu_free_object(os, obj);
}
+ if (err != ESRCH)
+ goto out;
+
+ /*
+ * Only the ZIL knows how to free log blocks.
+ */
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
/*
- * We need to sync out all in-flight IO before we try to evict
- * (the dataset evict func is trying to clear the cached entries
- * for this dataset in the ARC).
+ * Sync out all in-flight IO.
*/
txg_wait_synced(dd->dd_pool, 0);
@@ -1001,7 +1120,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
* context, the user space accounting should be zero.
*/
if (ds->ds_phys->ds_bp.blk_fill == 0 &&
- dmu_objset_userused_enabled(os->os)) {
+ dmu_objset_userused_enabled(os)) {
uint64_t count;
ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
@@ -1010,10 +1129,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
count == 0);
}
- dmu_objset_close(os);
- if (err != ESRCH)
- goto out;
-
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
rw_exit(&dd->dd_pool->dp_config_rwlock);
@@ -1021,30 +1136,48 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
if (err)
goto out;
- if (ds->ds_user_ptr) {
- /*
- * We need to sync out all in-flight IO before we try
- * to evict (the dataset evict func is trying to clear
- * the cached entries for this dataset in the ARC).
- */
- txg_wait_synced(dd->dd_pool, 0);
- }
-
/*
* Blow away the dsl_dir + head dataset.
*/
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
- dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, tag, 0);
- dsl_sync_task_create(dstg, dsl_dir_destroy_check,
- dsl_dir_destroy_sync, dd, FTAG, 0);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
+ /*
+ * If we're removing a clone, we might also need to remove its
+ * origin.
+ */
+ do {
+ dsda.need_prep = B_FALSE;
+ if (dsl_dir_is_clone(dd)) {
+ err = dsl_dataset_origin_rm_prep(&dsda, tag);
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ goto out;
+ }
+ }
+
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, &dsda, tag, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+
+ /*
+ * We could be racing against 'zfs release' or 'zfs destroy -d'
+ * on the origin snap, in which case we can get EBUSY if we
+ * needed to destroy the origin snap but were not ready to
+ * do so.
+ */
+ if (dsda.need_prep) {
+ ASSERT(err == EBUSY);
+ ASSERT(dsl_dir_is_clone(dd));
+ ASSERT(dsda.rm_origin == NULL);
+ }
+ } while (dsda.need_prep);
+
+ if (dsda.rm_origin != NULL)
+ dsl_dataset_disown(dsda.rm_origin, tag);
+
/* if it is successful, dsl_dir_destroy_sync will close the dd */
if (err)
dsl_dir_close(dd, FTAG);
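From the caller's point of view the origin-removal retry loop is invisible; the visible change is the new defer argument, which is only meaningful for snapshots (non-snapshots return EINVAL above). A sketch, assuming ds was obtained via dsl_dataset_own():

	/*
	 * Mark a still-held snapshot for deferred destruction; ds is
	 * consumed (closed) regardless of the outcome.
	 */
	err = dsl_dataset_destroy(ds, FTAG, B_TRUE);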
@@ -1053,47 +1186,6 @@ out:
return (err);
}
-int
-dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
-{
- int err;
-
- ASSERT(ds->ds_owner);
-
- dsl_dataset_make_exclusive(ds, ds->ds_owner);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, &ost, 0);
- /* drop exclusive access */
- mutex_enter(&ds->ds_lock);
- rw_exit(&ds->ds_rwlock);
- cv_broadcast(&ds->ds_exclusive_cv);
- mutex_exit(&ds->ds_lock);
- return (err);
-}
-
-void *
-dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func)
-{
- void *old;
-
- mutex_enter(&ds->ds_lock);
- old = ds->ds_user_ptr;
- if (old == NULL) {
- ds->ds_user_ptr = p;
- ds->ds_user_evict_func = func;
- }
- mutex_exit(&ds->ds_lock);
- return (old);
-}
-
-void *
-dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
-{
- return (ds->ds_user_ptr);
-}
-
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
@@ -1127,7 +1219,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
if (ds == NULL) /* this is the meta-objset */
return;
- ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_objset != NULL);
if (ds->ds_phys->ds_next_snap_obj != 0)
panic("dirtying snapshot!");
@@ -1154,62 +1246,51 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
uint64_t mrs_used;
uint64_t dlused, dlcomp, dluncomp;
- ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+ ASSERT(!dsl_dataset_is_snapshot(ds));
if (ds->ds_phys->ds_prev_snap_obj != 0)
mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
else
mrs_used = 0;
- VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
- &dluncomp));
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
ASSERT3U(dlused, <=, mrs_used);
ds->ds_phys->ds_unique_bytes =
ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
- if (!DS_UNIQUE_IS_ACCURATE(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
-{
- if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
- dsl_dataset_recalc_head_uniq(ds);
-
- return (ds->ds_phys->ds_unique_bytes);
-}
-
struct killarg {
dsl_dataset_t *ds;
- zio_t *zio;
dmu_tx_t *tx;
};
/* ARGSUSED */
static int
-kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
if (bp == NULL)
return (0);
- if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
- (zb->zb_object != 0 && dnp == NULL)) {
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
/*
* It's a block in the intent log. It has no
* accounting, so just free it.
*/
- VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
- ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
+ ASSERT(zilog == NULL);
ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
- (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
return (0);
@@ -1217,143 +1298,6 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
/* ARGSUSED */
static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- dmu_objset_type_t *ost = arg2;
-
- /*
- * We can only roll back to emptyness if it is a ZPL objset.
- */
- if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
- return (EINVAL);
-
- /*
- * This must not be a snapshot.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- dmu_objset_type_t *ost = arg2;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- if (ds->ds_user_ptr != NULL) {
- /*
- * We need to make sure that the objset_impl_t is reopened after
- * we do the rollback, otherwise it will have the wrong
- * objset_phys_t. Normally this would happen when this
- * dataset-open is closed, thus causing the
- * dataset to be immediately evicted. But when doing "zfs recv
- * -F", we reopen the objset before that, so that there is no
- * window where the dataset is closed and inconsistent.
- */
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
- }
-
- /* Transfer space that was freed since last snap back to the head. */
- {
- uint64_t used;
-
- VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
- ds->ds_origin_txg, UINT64_MAX, &used));
- dsl_dir_transfer_space(ds->ds_dir, used,
- DD_USED_SNAP, DD_USED_HEAD, tx);
- }
-
- /* Zero out the deadlist. */
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- {
- /*
- * Free blkptrs that we gave birth to - this covers
- * claimed but not played log blocks too.
- */
- zio_t *zio;
- struct killarg ka;
-
- zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED);
- ka.ds = ds;
- ka.zio = zio;
- ka.tx = tx;
- (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- TRAVERSE_POST, kill_blkptr, &ka);
- (void) zio_wait(zio);
- }
-
- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
- if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
- /* Change our contents to that of the prev snapshot */
-
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
- ds->ds_prev->ds_phys->ds_used_bytes);
-
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes =
- ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
-
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
- }
- } else {
- objset_impl_t *osi;
-
- ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
- ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
- ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
-
- bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
- ds->ds_phys->ds_flags = 0;
- ds->ds_phys->ds_unique_bytes = 0;
- if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
- SPA_VERSION_UNIQUE_ACCURATE)
- ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
- osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
- &ds->ds_phys->ds_bp, *ost, tx);
-#ifdef _KERNEL
- zfs_create_fs(&osi->os, kcred, NULL, tx);
-#endif
- }
-
- spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", ds->ds_object);
-}
-
-/* ARGSUSED */
-static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
@@ -1368,7 +1312,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
if (ds->ds_prev != NULL &&
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
+ return (EBUSY);
/*
* This is really a dsl_dir thing, but check it here so that
@@ -1386,7 +1330,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1395,22 +1339,72 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
- spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
}
+static int
+dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
+ dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *ds_prev = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(ds_prev)) {
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ /*
+ * If we're not prepared to remove the origin, don't remove
+ * the clone either.
+ */
+ if (dsda->rm_origin == NULL) {
+ dsda->need_prep = B_TRUE;
+ return (EBUSY);
+ }
+
+ ndsda.ds = ds_prev;
+ ndsda.is_origin_rm = B_TRUE;
+ return (dsl_dataset_destroy_check(&ndsda, tag, tx));
+ }
+
+ /*
+ * If we're not going to remove the origin after all,
+ * undo the open context setup.
+ */
+ if (dsda->rm_origin != NULL) {
+ dsl_dataset_disown(dsda->rm_origin, tag);
+ dsda->rm_origin = NULL;
+ }
+
+ return (0);
+}
+
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
/* we have an owner hold, so no one else can destroy us */
ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
- /* Can't delete a branch point. */
- if (ds->ds_phys->ds_num_children > 1)
- return (EEXIST);
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (dsda->defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ ASSERT(dsl_dataset_is_snapshot(ds));
+ return (0);
+ }
/*
* Can't delete a head dataset if there are snapshots of it.
@@ -1419,7 +1413,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
if (ds->ds_prev != NULL &&
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
+ return (EBUSY);
/*
* If we made changes this txg, traverse_dsl_dataset won't find
@@ -1428,6 +1422,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
return (EAGAIN);
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0 && !dsda->releasing)
+ return (EBUSY);
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Can't delete a branch point. However, if we're destroying
+ * a clone and removing its origin due to it having a user
+ * hold count of 0 and having been marked for deferred destroy,
+ * it's OK for the origin to have a single clone.
+ */
+ if (ds->ds_phys->ds_num_children >
+ (dsda->is_origin_rm ? 2 : 1)) {
+ mutex_exit(&ds->ds_lock);
+ return (EEXIST);
+ }
+ mutex_exit(&ds->ds_lock);
+ } else if (dsl_dir_is_clone(ds->ds_dir)) {
+ return (dsl_dataset_origin_check(dsda, arg2, tx));
+ }
+
/* XXX we should do some i/o error checking... */
return (0);
}
@@ -1500,24 +1519,132 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (ds->ds_dir->dd_phys->dd_clones == 0)
+ return;
+
+ for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+ dsl_dataset_remove_clones_key(clone, mintxg, tx);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+}
+
+struct process_old_arg {
+ dsl_dataset_t *ds;
+ dsl_dataset_t *ds_prev;
+ boolean_t after_branch_point;
+ zio_t *pio;
+ uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+ poa->ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY3U(zio_wait(poa.pio), ==, 0);
+ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
+ ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj);
+}
+
void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
- zio_t *zio;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
int err;
int after_branch_point = FALSE;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
dsl_dataset_t *ds_prev = NULL;
+ boolean_t wont_destroy;
uint64_t obj;
- ASSERT(ds->ds_owner);
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ wont_destroy = (dsda->defer &&
+ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
+
+ ASSERT(ds->ds_owner || wont_destroy);
+ ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ if (wont_destroy) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ return;
+ }
+
/* signal any waiters that this dataset is going away */
mutex_enter(&ds->ds_lock);
ds->ds_owner = dsl_reaper;
@@ -1526,14 +1653,21 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
/* Remove our reservation */
if (ds->ds_reserved != 0) {
- uint64_t val = 0;
- dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+ dsl_prop_setarg_t psa;
+ uint64_t value = 0;
+
+ dsl_prop_setarg_init_uint64(&psa, "refreservation",
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ &value);
+ psa.psa_effective_value = 0; /* predict default value */
+
+ dsl_dataset_set_reservation_sync(ds, &psa, tx);
ASSERT3U(ds->ds_reserved, ==, 0);
}
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- dsl_pool_ds_destroyed(ds, tx);
+ dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
@@ -1562,26 +1696,36 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
/* This clone is toast. */
ASSERT(ds_prev->ds_phys->ds_num_children > 1);
ds_prev->ds_phys->ds_num_children--;
+
+ /*
+ * If the clone's origin has no other clones, no
+ * user holds, and has been marked for deferred
+ * deletion, then we should have done the necessary
+ * destroy setup for it.
+ */
+ if (ds_prev->ds_phys->ds_num_children == 1 &&
+ ds_prev->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds_prev)) {
+ ASSERT3P(dsda->rm_origin, !=, NULL);
+ } else {
+ ASSERT3P(dsda->rm_origin, ==, NULL);
+ }
} else if (!after_branch_point) {
ds_prev->ds_phys->ds_next_snap_obj =
ds->ds_phys->ds_next_snap_obj;
}
}
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- blkptr_t bp;
+ if (dsl_dataset_is_snapshot(ds)) {
dsl_dataset_t *ds_next;
- uint64_t itor = 0;
uint64_t old_unique;
- int64_t used = 0, compressed = 0, uncompressed = 0;
+ uint64_t used = 0, comp = 0, uncomp = 0;
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
- old_unique = dsl_dataset_unique(ds_next);
+ old_unique = ds_next->ds_phys->ds_unique_bytes;
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
ds_next->ds_phys->ds_prev_snap_obj =
@@ -1591,53 +1735,49 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
- /*
- * Transfer to our deadlist (which will become next's
- * new deadlist) any entries from next's current
- * deadlist which were born before prev, and free the
- * other entries.
- *
- * XXX we're doing this long task with the config lock held
- */
- while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
- if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
- &bp, tx));
- if (ds_prev && !after_branch_point &&
- bp.blk_birth >
- ds_prev->ds_phys->ds_prev_snap_txg) {
- ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- } else {
- used += bp_get_dasize(dp->dp_spa, &bp);
- compressed += BP_GET_PSIZE(&bp);
- uncompressed += BP_GET_UCSIZE(&bp);
- /* XXX check return value? */
- (void) dsl_free(zio, dp, tx->tx_txg,
- &bp, NULL, NULL, ARC_NOWAIT);
- }
- }
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds_prev->ds_phys->ds_prev_snap_txg,
+ ds->ds_phys->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ ds_prev->ds_phys->ds_unique_bytes += used;
+ }
- /* change snapused */
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
- -used, -compressed, -uncompressed, tx);
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_deadlist_obj, tx);
+ }
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
- /* free next's deadlist */
- bplist_close(&ds_next->ds_deadlist);
- bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+ /* Collapse range in clone heads */
+ dsl_dataset_remove_clones_key(ds,
+ ds->ds_phys->ds_creation_txg, tx);
- /* set next's deadlist to our deadlist */
- bplist_close(&ds->ds_deadlist);
- ds_next->ds_phys->ds_deadlist_obj =
- ds->ds_phys->ds_deadlist_obj;
- VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj));
- ds->ds_phys->ds_deadlist_obj = 0;
+ if (dsl_dataset_is_snapshot(ds_next)) {
+ dsl_dataset_t *ds_nextnext;
- if (ds_next->ds_phys->ds_next_snap_obj != 0) {
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
@@ -1646,25 +1786,27 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
- *
- * XXX we're doing this long task with the
- * config lock held
*/
- dsl_dataset_t *ds_after_next;
- uint64_t space;
-
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds_next->ds_phys->ds_next_snap_obj,
- FTAG, &ds_after_next));
-
- VERIFY(0 ==
- bplist_space_birthrange(&ds_after_next->ds_deadlist,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
ds->ds_phys->ds_prev_snap_txg,
- ds->ds_phys->ds_creation_txg, &space));
- ds_next->ds_phys->ds_unique_bytes += space;
-
- dsl_dataset_rele(ds_after_next, FTAG);
+ ds->ds_phys->ds_creation_txg,
+ &used, &comp, &uncomp);
+ ds_next->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &hds));
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ ds->ds_phys->ds_creation_txg, tx);
+ dsl_dataset_rele(hds, FTAG);
+
} else {
ASSERT3P(ds_next->ds_prev, ==, ds);
dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
@@ -1704,9 +1846,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
*/
struct killarg ka;
- ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj = 0;
/*
@@ -1717,17 +1858,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
* freed all the objects in open context.
*/
ka.ds = ds;
- ka.zio = zio;
ka.tx = tx;
err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
TRAVERSE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == 0);
+
+ if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(mos,
+ ds->ds_prev->ds_dir->dd_phys->dd_clones,
+ ds->ds_object, tx));
+ }
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = ds_prev = NULL;
+ }
}
- err = zio_wait(zio);
- ASSERT3U(err, ==, 0);
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
/* Erase the link in the dir */
@@ -1762,8 +1918,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_rele(ds_prev, FTAG);
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
if (ds->ds_phys->ds_next_clones_obj != 0) {
uint64_t count;
@@ -1774,10 +1930,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
}
if (ds->ds_phys->ds_props_obj != 0)
VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+ if (ds->ds_phys->ds_userrefs_obj != 0)
+ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
dsl_dir_close(ds->ds_dir, ds);
ds->ds_dir = NULL;
dsl_dataset_drain_refs(ds, tag);
VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+ if (dsda->rm_origin) {
+ /*
+ * Remove the origin of the clone we just destroyed.
+ */
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ ndsda.ds = dsda->rm_origin;
+ dsl_dataset_destroy_sync(&ndsda, tag, tx);
+ }
}
static int
@@ -1793,8 +1961,9 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
- asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
- if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
/*
@@ -1807,7 +1976,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
int
dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1848,7 +2016,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *snapname = arg2;
@@ -1919,25 +2087,31 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
- int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
- add, 0, 0, tx);
+ delta, 0, 0, tx);
}
- bplist_close(&ds->ds_deadlist);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
+ ds->ds_dir->dd_myname, snapname, dsobj,
+ ds->ds_phys->ds_prev_snap_txg);
+ ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
+ UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, tx);
+
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
ds->ds_phys->ds_prev_snap_obj = dsobj;
ds->ds_phys->ds_prev_snap_txg = crtxg;
ds->ds_phys->ds_unique_bytes = 0;
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
- dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &dsobj, tx);
ASSERT(err == 0);
@@ -1947,9 +2121,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == dsl_dataset_get_ref(dp,
ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
- dsl_pool_ds_snapshotted(ds, tx);
+ dsl_scan_ds_snapshotted(ds, tx);
- spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
"dataset = %llu", dsobj);
}
@@ -1957,7 +2133,7 @@ void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_objset != NULL);
ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
/*
@@ -1968,7 +2144,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dsl_dir_dirty(ds->ds_dir, tx);
- dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+ dmu_objset_sync(ds->ds_objset, zio, tx);
}
void
@@ -1992,6 +2168,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
ds->ds_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
ds->ds_phys->ds_guid);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+ ds->ds_phys->ds_unique_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+ ds->ds_object);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+ ds->ds_userrefs);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
if (ds->ds_phys->ds_next_snap_obj) {
/*
@@ -2075,8 +2259,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
if (ds->ds_prev == NULL)
return (B_FALSE);
if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (B_TRUE);
+ ds->ds_prev->ds_phys->ds_creation_txg) {
+ objset_t *os, *os_prev;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_prev->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
return (B_FALSE);
}
@@ -2113,8 +2310,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *newsnapname = arg2;
@@ -2138,8 +2334,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
ds->ds_snapname, 8, 1, &ds->ds_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
dsl_dataset_rele(hds, FTAG);
}
@@ -2151,43 +2347,36 @@ struct renamesnaparg {
};
static int
-dsl_snapshot_rename_one(char *name, void *arg)
+dsl_snapshot_rename_one(const char *name, void *arg)
{
struct renamesnaparg *ra = arg;
dsl_dataset_t *ds = NULL;
- char *cp;
+ char *snapname;
int err;
- cp = name + strlen(name);
- *cp = '@';
- (void) strcpy(cp + 1, ra->oldsnap);
+ snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
+ (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
/*
* For recursive snapshot renames the parent won't be changing
* so we just pass name for both the to/from argument.
*/
- err = zfs_secpolicy_rename_perms(name, name, CRED());
- if (err == ENOENT) {
- return (0);
- } else if (err) {
- (void) strcpy(ra->failed, name);
- return (err);
+ err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
+ if (err != 0) {
+ strfree(snapname);
+ return (err == ENOENT ? 0 : err);
}
#ifdef _KERNEL
/*
* For all filesystems undergoing rename, we'll need to unmount it.
*/
- (void) zfs_unmount_snap(name, NULL);
+ (void) zfs_unmount_snap(snapname, NULL);
#endif
- err = dsl_dataset_hold(name, ra->dstg, &ds);
- *cp = '\0';
- if (err == ENOENT) {
- return (0);
- } else if (err) {
- (void) strcpy(ra->failed, name);
- return (err);
- }
+ err = dsl_dataset_hold(snapname, ra->dstg, &ds);
+ strfree(snapname);
+ if (err != 0)
+ return (err == ENOENT ? 0 : err);
dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
@@ -2203,7 +2392,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
dsl_sync_task_t *dst;
spa_t *spa;
char *cp, *fsname = spa_strdup(oldname);
- int len = strlen(oldname);
+ int len = strlen(oldname) + 1;
/* truncate the snapshot name to get the fsname */
cp = strchr(fsname, '@');
@@ -2211,7 +2400,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
err = spa_open(fsname, &spa, FTAG);
if (err) {
- kmem_free(fsname, len + 1);
+ kmem_free(fsname, len);
return (err);
}
ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
@@ -2223,7 +2412,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
DS_FIND_CHILDREN);
- kmem_free(fsname, len + 1);
+ kmem_free(fsname, len);
if (err == 0) {
err = dsl_sync_task_group_wait(ra->dstg);
@@ -2234,14 +2423,15 @@ dsl_recursive_rename(char *oldname, const char *newname)
dsl_dataset_t *ds = dst->dst_arg1;
if (dst->dst_err) {
dsl_dir_name(ds->ds_dir, ra->failed);
- (void) strcat(ra->failed, "@");
- (void) strcat(ra->failed, ra->newsnap);
+ (void) strlcat(ra->failed, "@", sizeof (ra->failed));
+ (void) strlcat(ra->failed, ra->newsnap,
+ sizeof (ra->failed));
}
dsl_dataset_rele(ds, ra->dstg);
}
if (err)
- (void) strcpy(oldname, ra->failed);
+ (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
dsl_sync_task_group_destroy(ra->dstg);
kmem_free(ra, sizeof (struct renamesnaparg));
@@ -2250,7 +2440,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
}
static int
-dsl_valid_rename(char *oldname, void *arg)
+dsl_valid_rename(const char *oldname, void *arg)
{
int delta = *(int *)arg;
@@ -2272,12 +2462,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
err = dsl_dir_open(oldname, FTAG, &dd, &tail);
if (err)
return (err);
- /*
- * If there are more than 2 references there may be holds
- * hanging around that haven't been cleared out yet.
- */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- txg_wait_synced(dd->dd_pool, 0);
+
if (tail == NULL) {
int delta = strlen(newname) - strlen(oldname);
@@ -2286,13 +2471,14 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
err = dmu_objset_find(oldname, dsl_valid_rename,
&delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
- if (!err)
+ if (err == 0)
err = dsl_dir_rename(dd, newname);
dsl_dir_close(dd, FTAG);
return (err);
}
+
if (tail[0] != '@') {
- /* the name ended in a nonexistant component */
+ /* the name ended in a nonexistent component */
dsl_dir_close(dd, FTAG);
return (ENOENT);
}
@@ -2331,13 +2517,14 @@ struct promotenode {
struct promotearg {
list_t shared_snaps, origin_snaps, clone_snaps;
- dsl_dataset_t *origin_origin, *origin_head;
+ dsl_dataset_t *origin_origin;
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+ char *err_ds;
};
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static boolean_t snaplist_unstable(list_t *l);
-/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -2346,6 +2533,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
struct promotenode *snap = list_head(&pa->shared_snaps);
dsl_dataset_t *origin_ds = snap->ds;
int err;
+ uint64_t unused;
/* Check that it is a real clone */
if (!dsl_dir_is_clone(hds->ds_dir))
@@ -2361,10 +2549,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* compute origin's new unique space */
snap = list_tail(&pa->clone_snaps);
ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
- err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
- if (err)
- return (err);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &pa->unique, &unused, &unused);
/*
* Walk the snapshots that we are moving
@@ -2392,18 +2579,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* Check that the snapshot name does not conflict */
VERIFY(0 == dsl_dataset_get_snapname(ds));
err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
- if (err == 0)
- return (EEXIST);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
if (err != ENOENT)
- return (err);
+ goto out;
/* The very first snapshot does not have a deadlist */
if (ds->ds_phys->ds_prev_snap_obj == 0)
continue;
- if (err = bplist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp))
- return (err);
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
pa->used += dlused;
pa->comp += dlcomp;
pa->uncomp += dluncomp;
@@ -2436,19 +2624,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* Note, typically this will not be a clone of a clone,
- * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
- * these snaplist_space() -> bplist_space_birthrange()
+ * so dd_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> dsl_deadlist_space_range()
* calls will be fast because they do not have to
* iterate over all bps.
*/
snap = list_head(&pa->origin_snaps);
err = snaplist_space(&pa->shared_snaps,
- snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+ snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
if (err)
return (err);
err = snaplist_space(&pa->clone_snaps,
- snap->ds->ds_origin_txg, &space);
+ snap->ds->ds_dir->dd_origin_txg, &space);
if (err)
return (err);
pa->cloneusedsnap += space;
@@ -2461,10 +2649,13 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
return (0);
+out:
+ pa->err_ds = snap->ds->ds_snapname;
+ return (err);
}
static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *hds = arg1;
struct promotearg *pa = arg2;
@@ -2508,10 +2699,31 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(dd->dd_dbuf, tx);
ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
- hds->ds_origin_txg = origin_head->ds_origin_txg;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
dmu_buf_will_dirty(odd->dd_dbuf, tx);
odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
- origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+ origin_head->ds_dir->dd_origin_txg =
+ origin_ds->ds_phys->ds_creation_txg;
+
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, hds->ds_object, tx));
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ origin_head->ds_object, tx));
+ if (dd->dd_phys->dd_clones == 0) {
+ dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+
+ }
/* move snapshots to this dir */
for (snap = list_head(&pa->shared_snaps); snap;
@@ -2519,9 +2731,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_t *ds = snap->ds;
/* unregister props as dsl_dir is changing */
- if (ds->ds_user_ptr) {
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
- ds->ds_user_ptr = NULL;
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
}
/* move snap name entry */
VERIFY(0 == dsl_dataset_get_snapname(ds));
@@ -2530,6 +2742,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
@@ -2539,6 +2752,40 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
+ /* move any clone references */
+ if (ds->ds_phys->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+
+ VERIFY3U(zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, o, tx), ==, 0);
+ VERIFY3U(zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, o, tx), ==, 0);
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
ASSERT3U(dsl_prop_numcb(ds), ==, 0);
}
@@ -2568,8 +2815,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
origin_ds->ds_phys->ds_unique_bytes = pa->unique;
/* log history record */
- spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", hds->ds_object);
+ spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", hds->ds_object);
dsl_dir_close(odd, FTAG);
}
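
On pools at SPA_VERSION_DIR_CLONES and later, each dsl_dir carries a dd_clones ZAP listing the heads cloned from it, and promotion has to reshuffle those lists: the promoted head leaves the old origin's list while the demoted origin head joins the promoted dir's list (created on first use). The following standalone sketch only mirrors that bookkeeping with in-memory lists; every identifier in it is made up for illustration.

/* Standalone sketch of the dd_clones bookkeeping during promote
 * (hypothetical, in-memory stand-ins for the ZAP objects). */
#include <stdio.h>

#define	MAXC	8

struct clone_list { unsigned long long objs[MAXC]; int n; };

static void
clone_list_remove(struct clone_list *l, unsigned long long obj)
{
	for (int i = 0; i < l->n; i++)
		if (l->objs[i] == obj) {
			l->objs[i] = l->objs[--l->n];
			return;
		}
}

static void
clone_list_add(struct clone_list *l, unsigned long long obj)
{
	l->objs[l->n++] = obj;
}

int
main(void)
{
	struct clone_list origin_clones = { { 11, 22 }, 2 };	/* old origin dir */
	struct clone_list promoted_clones = { { 0 }, 0 };	/* promoted dir */
	unsigned long long hds = 22, origin_head = 33;

	/* The promoted head is no longer a clone of the old origin... */
	clone_list_remove(&origin_clones, hds);
	/* ...and the old origin head becomes a clone of the promoted dir. */
	clone_list_add(&promoted_clones, origin_head);

	printf("origin now tracks %d clone(s), promoted dir %d\n",
	    origin_clones.n, promoted_clones.n);
	return (0);
}
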
@@ -2634,11 +2881,9 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
*spacep = 0;
for (snap = list_head(l); snap; snap = list_next(l, snap)) {
- uint64_t used;
- int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- mintxg, UINT64_MAX, &used);
- if (err)
- return (err);
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
*spacep += used;
}
return (0);
@@ -2673,7 +2918,7 @@ snaplist_destroy(list_t *l, boolean_t own)
* NULL, indicating that the clone is not a clone of a clone).
*/
int
-dsl_dataset_promote(const char *name)
+dsl_dataset_promote(const char *name, char *conflsnap)
{
dsl_dataset_t *ds;
dsl_dir_t *dd;
@@ -2725,10 +2970,10 @@ dsl_dataset_promote(const char *name)
if (err != 0)
goto out;
- if (dsl_dir_is_clone(snap->ds->ds_dir)) {
- err = dsl_dataset_own_obj(dp,
+ if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
snap->ds->ds_dir->dd_phys->dd_origin_obj,
- 0, FTAG, &pa.origin_origin);
+ FTAG, &pa.origin_origin);
if (err != 0)
goto out;
}
@@ -2744,14 +2989,16 @@ out:
if (err == 0) {
err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
dsl_dataset_promote_sync, ds, &pa,
- 2 + 2 * doi.doi_physical_blks);
+ 2 + 2 * doi.doi_physical_blocks_512);
+ if (err && pa.err_ds && conflsnap)
+ (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
}
snaplist_destroy(&pa.shared_snaps, B_TRUE);
snaplist_destroy(&pa.clone_snaps, B_FALSE);
snaplist_destroy(&pa.origin_snaps, B_FALSE);
if (pa.origin_origin)
- dsl_dataset_disown(pa.origin_origin, FTAG);
+ dsl_dataset_rele(pa.origin_origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -2778,9 +3025,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (csa->cds->ds_prev != csa->ohds->ds_prev)
return (EINVAL);
- /* cds should be the clone */
- if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
- csa->ohds->ds_object)
+ /* cds should be the clone (unless they are unrelated) */
+ if (csa->cds->ds_prev != NULL &&
+ csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
+ csa->ohds->ds_object !=
+ csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
return (EINVAL);
/* the clone should be a child of the origin */
@@ -2803,38 +3052,49 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
+ if (csa->ohds->ds_quota != 0 &&
+ csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
+ return (EDQUOT);
+
return (0);
}
/* ARGSUSED */
static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
struct cloneswaparg *csa = arg1;
dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
ASSERT(csa->cds->ds_reserved == 0);
- ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);
+ ASSERT(csa->ohds->ds_quota == 0 ||
+ csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
- dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
- if (csa->cds->ds_user_ptr != NULL) {
- csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
- csa->cds->ds_user_ptr = NULL;
+ if (csa->cds->ds_objset != NULL) {
+ dmu_objset_evict(csa->cds->ds_objset);
+ csa->cds->ds_objset = NULL;
}
- if (csa->ohds->ds_user_ptr != NULL) {
- csa->ohds->ds_user_evict_func(csa->ohds,
- csa->ohds->ds_user_ptr);
- csa->ohds->ds_user_ptr = NULL;
+ if (csa->ohds->ds_objset != NULL) {
+ dmu_objset_evict(csa->ohds->ds_objset);
+ csa->ohds->ds_objset = NULL;
}
- /* reset origin's unique bytes */
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
- &csa->cds->ds_prev->ds_phys->ds_unique_bytes));
+ /*
+ * Reset origin's unique bytes, if it exists.
+ */
+ if (csa->cds->ds_prev) {
+ dsl_dataset_t *origin = csa->cds->ds_prev;
+ uint64_t comp, uncomp;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+ origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+ }
/* swap blkptrs */
{
@@ -2853,10 +3113,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT3U(csa->cds->ds_dir->dd_phys->
dd_used_breakdown[DD_USED_SNAP], ==, 0);
- VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
- &cdl_comp, &cdl_uncomp));
- VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
- &odl_comp, &odl_uncomp));
+ dsl_deadlist_space(&csa->cds->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&csa->ohds->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
(csa->ohds->ds_phys->ds_used_bytes + odl_used);
@@ -2877,21 +3137,16 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
- VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
-#define SWITCH64(x, y) \
- { \
- uint64_t __tmp = (x); \
- (x) = (y); \
- (y) = __tmp; \
- }
-
/* swap ds_*_bytes */
SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
csa->cds->ds_phys->ds_used_bytes);
@@ -2906,22 +3161,26 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
csa->unused_refres_delta, 0, 0, tx);
- /* swap deadlists */
- bplist_close(&csa->cds->ds_deadlist);
- bplist_close(&csa->ohds->ds_deadlist);
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&csa->cds->ds_deadlist);
+ dsl_deadlist_close(&csa->ohds->ds_deadlist);
SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
csa->cds->ds_phys->ds_deadlist_obj);
- VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
- csa->cds->ds_phys->ds_deadlist_obj));
- VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
- csa->ohds->ds_phys->ds_deadlist_obj));
+ dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+ csa->cds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+ csa->ohds->ds_phys->ds_deadlist_obj);
- dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+ dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
/*
- * Swap 'clone' with its origin head file system. Used at the end
- * of "online recv" to swizzle the file system to the new version.
+ * Swap 'clone' with its origin head datasets. Used at the end of "zfs
+ * recv" into an existing fs to swizzle the file system to the new
+ * version, and by "zfs rollback". Can also be used to swap two
+ * independent head datasets if neither has any snapshots.
*/
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
@@ -2933,9 +3192,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
ASSERT(clone->ds_owner);
ASSERT(origin_head->ds_owner);
retry:
- /* Need exclusive access for the swap */
- rw_enter(&clone->ds_rwlock, RW_WRITER);
- if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
+ /*
+ * Need exclusive access for the swap. If we're swapping these
+ * datasets back after an error, we already hold the locks.
+ */
+ if (!RW_WRITE_HELD(&clone->ds_rwlock))
+ rw_enter(&clone->ds_rwlock, RW_WRITER);
+ if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
+ !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
rw_exit(&clone->ds_rwlock);
rw_enter(&origin_head->ds_rwlock, RW_WRITER);
if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
@@ -3030,62 +3294,70 @@ static int
dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
+ dsl_prop_setarg_t *psa = arg2;
+ int err;
if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
return (ENOTSUP);
- if (new_quota == 0)
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ if (psa->psa_effective_value == 0)
return (0);
- if (new_quota < ds->ds_phys->ds_used_bytes ||
- new_quota < ds->ds_reserved)
+ if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+ psa->psa_effective_value < ds->ds_reserved)
return (ENOSPC);
return (0);
}
-/* ARGSUSED */
+extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
+
void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
- ds->ds_quota = new_quota;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
- dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+ if (ds->ds_quota != effective_value) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_quota = effective_value;
- spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu ",
- (longlong_t)new_quota, ds->ds_object);
+ spa_history_log_internal(LOG_DS_REFQUOTA,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
+ (longlong_t)ds->ds_quota, ds->ds_object);
+ }
}
int
-dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
{
dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
+ dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
+
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
- if (quota != ds->ds_quota) {
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(ds->ds_dir->dd_pool, 0);
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(ds->ds_dir->dd_pool, 0);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
+ ds, &psa, 0);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
- ds, &quota, 0);
- }
dsl_dataset_rele(ds, FTAG);
return (err);
}
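
The refquota path above predicts the effective property value in the check and only rejects it when it is non-zero and below either the space the dataset already references or its refreservation. A minimal standalone sketch of that acceptance rule, with made-up numbers:

/* Standalone sketch of the refquota acceptance rule (hypothetical values). */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* A refquota of 0 means "none"; otherwise it may not be set below the
 * space already referenced or below the refreservation. */
static int
refquota_check(uint64_t new_quota, uint64_t used_bytes, uint64_t reserved)
{
	if (new_quota == 0)
		return (0);
	if (new_quota < used_bytes || new_quota < reserved)
		return (ENOSPC);
	return (0);
}

int
main(void)
{
	printf("%d\n", refquota_check(10, 30, 0) == ENOSPC);	/* too small: 1 */
	printf("%d\n", refquota_check(0, 30, 0));		/* 0 clears it: 0 */
	return (0);
}
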
@@ -3094,9 +3366,10 @@ static int
dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value;
uint64_t unique;
+ int err;
if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
SPA_VERSION_REFRESERVATION)
@@ -3105,6 +3378,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (dsl_dataset_is_snapshot(ds))
return (EINVAL);
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ effective_value = psa->psa_effective_value;
+
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
@@ -3113,67 +3391,645 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = ds->ds_phys->ds_unique_bytes;
mutex_exit(&ds->ds_lock);
- if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
- uint64_t delta = MAX(unique, new_reservation) -
+ if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
+ uint64_t delta = MAX(unique, effective_value) -
MAX(unique, ds->ds_reserved);
if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
if (ds->ds_quota > 0 &&
- new_reservation > ds->ds_quota)
+ effective_value > ds->ds_quota)
return (ENOSPC);
}
return (0);
}
-/* ARGSUSED */
static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx)
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
uint64_t unique;
int64_t delta;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
+
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
- delta = MAX(0, (int64_t)(new_reservation - unique)) -
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = ds->ds_phys->ds_unique_bytes;
+ delta = MAX(0, (int64_t)(effective_value - unique)) -
MAX(0, (int64_t)(ds->ds_reserved - unique));
- ds->ds_reserved = new_reservation;
+ ds->ds_reserved = effective_value;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
mutex_exit(&ds->ds_dir->dd_lock);
- dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
- new_reservation, cr, tx);
- spa_history_internal_log(LOG_DS_REFRESERV,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
- (longlong_t)new_reservation, ds->ds_object);
+ spa_history_log_internal(LOG_DS_REFRESERV,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
+ (longlong_t)effective_value, ds->ds_object);
}
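
The sync function charges DD_USED_REFRSRV with only the part of the new refreservation not already covered by the dataset's unique bytes, which is what the MAX() expression above computes. A standalone sketch of that arithmetic, numbers made up (read them as GB):

/* Standalone sketch of the refreservation delta above (made-up numbers). */
#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Only the part of the reservation not covered by unique bytes is charged. */
static int64_t
refrsrv_delta(uint64_t unique, uint64_t old_resv, uint64_t new_resv)
{
	return (MAX(0, (int64_t)(new_resv - unique)) -
	    MAX(0, (int64_t)(old_resv - unique)));
}

int
main(void)
{
	/* 3 unique, raising refreservation from 2 to 10 charges 7. */
	printf("%lld\n", (long long)refrsrv_delta(3, 2, 10));	/* 7 */
	/* Lowering it from 10 back to 2 returns the 7. */
	printf("%lld\n", (long long)refrsrv_delta(3, 10, 2));	/* -7 */
	return (0);
}
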
int
-dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation)
{
dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
+ dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
+ &reservation);
+
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_set_reservation_check,
- dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+ dsl_dataset_set_reservation_sync, ds, &psa, 0);
+
dsl_dataset_rele(ds, FTAG);
return (err);
}
+
+typedef struct zfs_hold_cleanup_arg {
+ dsl_pool_t *dp;
+ uint64_t dsobj;
+ char htag[MAXNAMELEN];
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *ca = arg;
+
+ (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
+ B_TRUE);
+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+}
+
+void
+dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *ca;
+
+ ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
+ ca->dp = ds->ds_dir->dd_pool;
+ ca->dsobj = ds->ds_object;
+ (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
+ VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, ca, NULL));
+}
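
A temporary hold can be tied to the lifetime of the caller's control-device open: the minor number names that open, and zfs_onexit_add_cb() queues dsl_dataset_user_release_onexit() to run when it is closed. The sketch below shows the same register-a-cleanup-callback pattern in a generic, standalone form; none of these identifiers are the real ZFS onexit API.

/* Standalone sketch of a register-cleanup-callback pattern (hypothetical API). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef void (*cleanup_cb_t)(void *);

struct cleanup {
	cleanup_cb_t	func;
	void		*arg;
	struct cleanup	*next;
};

static struct cleanup *cleanup_list;

static void
onexit_add_cb(cleanup_cb_t func, void *arg)
{
	struct cleanup *c = malloc(sizeof (*c));

	c->func = func;
	c->arg = arg;
	c->next = cleanup_list;
	cleanup_list = c;
}

/* Run and free every queued callback, as if the descriptor were closing. */
static void
onexit_fire(void)
{
	while (cleanup_list != NULL) {
		struct cleanup *c = cleanup_list;

		cleanup_list = c->next;
		c->func(c->arg);
		free(c);
	}
}

static void
release_hold(void *arg)
{
	printf("releasing hold tag <%s>\n", (char *)arg);
	free(arg);
}

int
main(void)
{
	onexit_add_cb(release_hold, strdup(".send-1234-1"));
	onexit_fire();
	return (0);
}
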
+
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
+static int
+dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct dsl_ds_holdarg *ha = arg2;
+ char *htag = ha->htag;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int error = 0;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ if (!dsl_dataset_is_snapshot(ds))
+ return (EINVAL);
+
+ /* tags must be unique */
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj) {
+ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
+ 8, 1, tx);
+ if (error == 0)
+ error = EEXIST;
+ else if (error == ENOENT)
+ error = 0;
+ }
+ mutex_exit(&ds->ds_lock);
+
+ if (error == 0 && ha->temphold &&
+ strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ error = E2BIG;
+
+ return (error);
+}
+
+void
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct dsl_ds_holdarg *ha = arg2;
+ char *htag = ha->htag;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t now = gethrestime_sec();
+ uint64_t zapobj;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = ds->ds_phys->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+ mutex_exit(&ds->ds_lock);
+
+ VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ if (ha->temphold) {
+ VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
+ htag, &now, tx));
+ }
+
+ spa_history_log_internal(LOG_DS_USER_HOLD,
+ dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
+ (int)ha->temphold, ds->ds_object);
+}
+
+static int
+dsl_dataset_user_hold_one(const char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ dsl_dataset_t *ds;
+ int error;
+ char *name;
+
+ /* alloc a buffer to hold dsname@snapname plus terminating NULL */
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, ha->dstg, &ds);
+ strfree(name);
+ if (error == 0) {
+ ha->gotone = B_TRUE;
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, ds, ha, 0);
+ } else if (error == ENOENT && ha->recursive) {
+ error = 0;
+ } else {
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+ }
+ return (error);
+}
+
+int
+dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold)
+{
+ struct dsl_ds_holdarg *ha;
+ int error;
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+ ha->htag = htag;
+ ha->temphold = temphold;
+ error = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
+ ds, ha, 0);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+
+ return (error);
+}
+
+int
+dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive, boolean_t temphold, int cleanup_fd)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+ minor_t minor = 0;
+
+ if (cleanup_fd != -1) {
+ /* Currently we only support cleanup-on-exit of tempholds. */
+ if (!temphold)
+ return (EINVAL);
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error)
+ return (error);
+ }
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ ha->temphold = temphold;
+
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_hold_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+
+ if (dst->dst_err) {
+ dsl_dataset_name(ds, ha->failed);
+ *strchr(ha->failed, '@') = '\0';
+ } else if (error == 0 && minor != 0 && temphold) {
+ /*
+ * If this hold is to be released upon process exit,
+ * register that action now.
+ */
+ dsl_register_onexit_hold_cleanup(ds, htag, minor);
+ }
+ dsl_dataset_rele(ds, ha->dstg);
+ }
+
+ if (error == 0 && recursive && !ha->gotone)
+ error = ENOENT;
+
+ if (error)
+ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
+
+ dsl_sync_task_group_destroy(ha->dstg);
+
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+}
+
+struct dsl_ds_releasearg {
+ dsl_dataset_t *ds;
+ const char *htag;
+ boolean_t own; /* do we own or just hold ds? */
+};
+
+static int
+dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
+ boolean_t *might_destroy)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t tmp;
+ int error;
+
+ *might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ if (zapobj == 0) {
+ /* The tag can't possibly exist */
+ mutex_exit(&ds->ds_lock);
+ return (ESRCH);
+ }
+
+ /* Make sure the tag exists */
+ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
+ if (error) {
+ mutex_exit(&ds->ds_lock);
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds))
+ *might_destroy = B_TRUE;
+
+ mutex_exit(&ds->ds_lock);
+ return (0);
+}
+
+static int
+dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ boolean_t might_destroy;
+ int error;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
+ if (error)
+ return (error);
+
+ if (might_destroy) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ if (dmu_tx_is_syncing(tx)) {
+ /*
+ * If we're not prepared to remove the snapshot,
+ * we can't allow the release to happen right now.
+ */
+ if (!ra->own)
+ return (EBUSY);
+ }
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ return (dsl_dataset_destroy_check(&dsda, tag, tx));
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t refs;
+ int error;
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_userrefs--;
+ refs = ds->ds_userrefs;
+ mutex_exit(&ds->ds_lock);
+ error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
+ VERIFY(error == 0 || error == ENOENT);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
+ if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds)) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ ASSERT(ra->own);
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ /* We already did the destroy_check */
+ dsl_dataset_destroy_sync(&dsda, tag, tx);
+ }
+
+ spa_history_log_internal(LOG_DS_USER_RELEASE,
+ dp->dp_spa, tx, "<%s> %lld dataset = %llu",
+ ra->htag, (longlong_t)refs, dsobj);
+}
+
+static int
+dsl_dataset_user_release_one(const char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ struct dsl_ds_releasearg *ra;
+ dsl_dataset_t *ds;
+ int error;
+ void *dtag = ha->dstg;
+ char *name;
+ boolean_t own = B_FALSE;
+ boolean_t might_destroy;
+
+ /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, dtag, &ds);
+ strfree(name);
+ if (error == ENOENT && ha->recursive)
+ return (0);
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+ if (error)
+ return (error);
+
+ ha->gotone = B_TRUE;
+
+ ASSERT(dsl_dataset_is_snapshot(ds));
+
+ error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+
+ if (might_destroy) {
+#ifdef _KERNEL
+ name = kmem_asprintf("%s@%s", dsname, ha->snapname);
+ error = zfs_unmount_snap(name, NULL);
+ strfree(name);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+#endif
+ if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
+ dsl_dataset_rele(ds, dtag);
+ return (EBUSY);
+ } else {
+ own = B_TRUE;
+ dsl_dataset_make_exclusive(ds, dtag);
+ }
+ }
+
+ ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
+ ra->ds = ds;
+ ra->htag = ha->htag;
+ ra->own = own;
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, ra, dtag, 0);
+
+ return (0);
+}
+
+int
+dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+
+top:
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_release_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ struct dsl_ds_releasearg *ra = dst->dst_arg1;
+ dsl_dataset_t *ds = ra->ds;
+
+ if (dst->dst_err)
+ dsl_dataset_name(ds, ha->failed);
+
+ if (ra->own)
+ dsl_dataset_disown(ds, ha->dstg);
+ else
+ dsl_dataset_rele(ds, ha->dstg);
+
+ kmem_free(ra, sizeof (struct dsl_ds_releasearg));
+ }
+
+ if (error == 0 && recursive && !ha->gotone)
+ error = ENOENT;
+
+ if (error && error != EBUSY)
+ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
+
+ dsl_sync_task_group_destroy(ha->dstg);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+
+ /*
+ * We can get EBUSY if we were racing with deferred destroy and
+ * dsl_dataset_user_release_check() hadn't done the necessary
+ * open context setup. We can also get EBUSY if we're racing
+ * with destroy and that thread is the ds_owner. Either way
+ * the busy condition should be transient, and we should retry
+ * the release operation.
+ */
+ if (error == EBUSY)
+ goto top;
+
+ return (error);
+}
+
+/*
+ * Called at spa_load time (with retry == B_FALSE) to release a stale
+ * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
+ */
+int
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
+ boolean_t retry)
+{
+ dsl_dataset_t *ds;
+ char *snap;
+ char *name;
+ int namelen;
+ int error;
+
+ do {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error)
+ return (error);
+ namelen = dsl_dataset_namelen(ds)+1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(ds, name);
+ dsl_dataset_rele(ds, FTAG);
+
+ snap = strchr(name, '@');
+ *snap = '\0';
+ ++snap;
+ error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
+ kmem_free(name, namelen);
+
+ /*
+ * The object can't have been destroyed because we have a hold,
+ * but it might have been renamed, resulting in ENOENT. Retry
+ * if we've been requested to do so.
+ *
+ * It would be nice if we could use the dsobj all the way
+ * through and avoid ENOENT entirely. But we might need to
+ * unmount the snapshot, and there's currently no way to lookup
+ * a vfsp using a ZFS object id.
+ */
+ } while ((error == ENOENT) && retry);
+
+ return (error);
+}
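
Because the release path works on names rather than object numbers, release_tmp rebuilds the full snapshot name and splits it at the '@' before calling dsl_dataset_user_release(). A trivial standalone illustration of that split, using a made-up name:

/* Standalone sketch of splitting "fs@snap" into its two halves. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int
main(void)
{
	char *name = strdup("tank/home@tuesday");
	char *snap = strchr(name, '@');

	*snap = '\0';		/* name is now "tank/home" */
	snap++;			/* snap points at "tuesday" */
	printf("fs=%s snap=%s\n", name, snap);
	free(name);
	return (0);
}
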
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (err)
+ return (err);
+
+ VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
+ if (ds->ds_phys->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
+ za->za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find(). We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+ dsl_dataset_t *ds;
+
+ if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
+ if (DS_IS_INCONSISTENT(ds))
+ (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
+ else
+ dsl_dataset_disown(ds, FTAG);
+ }
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
new file mode 100644
index 000000000000..064f8aceb8ee
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_entry_t *dle1 = arg1;
+ const dsl_deadlist_entry_t *dle2 = arg2;
+
+ if (dle1->dle_mintxg < dle2->dle_mintxg)
+ return (-1);
+ else if (dle1->dle_mintxg > dle2->dle_mintxg)
+ return (+1);
+ else
+ return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havetree)
+ return;
+
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ za.za_first_integer));
+ avl_add(&dl->dl_tree, dle);
+ }
+ zap_cursor_fini(&zc);
+ dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+}
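
dsl_deadlist_open() decides the layout once, from the object's type: a DMU_OT_BPOBJ object is a pre-deadlists flat bpobj and sets dl_oldfmt, anything else is the new ZAP of per-snapshot bpobjs, and every later entry point branches on that flag rather than converting old pools. A standalone sketch of that dispatch shape, with stand-in types that are not the real ones:

/* Standalone sketch of the old-format/new-format dispatch (hypothetical types). */
#include <stdio.h>

enum fmt { FMT_OLD_BPOBJ, FMT_ZAP_OF_BPOBJS };

struct deadlist {
	enum fmt fmt;			/* decided once, at open time */
	unsigned long long flat_used;	/* old format: single bpobj total */
	unsigned long long hdr_used;	/* new format: cached header total */
};

/* Mirrors the shape of dsl_deadlist_space(): the old format asks the
 * bpobj, the new format reads the cached header. */
static unsigned long long
deadlist_space(const struct deadlist *dl)
{
	if (dl->fmt == FMT_OLD_BPOBJ)
		return (dl->flat_used);
	return (dl->hdr_used);
}

int
main(void)
{
	struct deadlist old = { FMT_OLD_BPOBJ, 123, 0 };
	struct deadlist new = { FMT_ZAP_OF_BPOBJS, 0, 456 };

	printf("%llu %llu\n", deadlist_space(&old), deadlist_space(&new));
	return (0);
}
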
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+ void *cookie = NULL;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt) {
+ dl->dl_oldfmt = B_FALSE;
+ bpobj_close(&dl->dl_bpobj);
+ return;
+ }
+
+ if (dl->dl_havetree) {
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+ != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+ }
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ mutex_destroy(&dl->dl_lock);
+ dl->dl_dbuf = NULL;
+ dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+ sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, os, dlobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc))
+ bpobj_free(os, za.za_first_integer, tx);
+ zap_cursor_fini(&zc);
+ VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used +=
+ bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+ mutex_exit(&dl->dl_lock);
+
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+ bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
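
dsl_deadlist_insert() files a freed block into the bucket whose mintxg is the largest one still below the block's birth txg; an exact key match falls back to the previous bucket because mintxg is an exclusive lower bound. A standalone sketch of that selection rule over a plain sorted array standing in for the AVL tree; all names here are invented:

/* Standalone sketch of the deadlist bucket-selection rule (hypothetical names). */
#include <stdio.h>
#include <stdint.h>

/* Return the index of the entry with the largest mintxg strictly below birth.
 * keys[] must be sorted ascending, and keys[0] is assumed to be a low "zero"
 * entry so a bucket always exists (the AVL_BEFORE/AVL_PREV fallback above). */
static int
pick_bucket(const uint64_t *keys, int nkeys, uint64_t birth)
{
	int i, best = 0;

	for (i = 0; i < nkeys; i++) {
		if (keys[i] < birth)
			best = i;
		else
			break;
	}
	return (best);
}

int
main(void)
{
	uint64_t keys[] = { 0, 100, 250, 400 };		/* entry mintxgs */
	uint64_t births[] = { 50, 100, 101, 400, 999 };

	for (int i = 0; i < 5; i++)
		printf("birth %llu -> bucket mintxg %llu\n",
		    (unsigned long long)births[i],
		    (unsigned long long)keys[pick_bucket(keys, 4, births[i])]);
	return (0);
}
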
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+ bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+ dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+ mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+ usedp, compp, uncompp));
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ *usedp = dl->dl_phys->dl_used;
+ *compp = dl->dl_phys->dl_comp;
+ *uncompp = dl->dl_phys->dl_uncomp;
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+ *usedp = *compp = *uncompp = 0;
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ /*
+ * If we don't find this mintxg, there shouldn't be anything
+ * after it either.
+ */
+ ASSERT(dle != NULL ||
+ avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+ for (; dle && dle->dle_mintxg < maxtxg;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t used, comp, uncomp;
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+}
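
The loop above starts at the entry keyed exactly at mintxg and stops before maxtxg, so the sum covers blocks born in the half-open interval (mintxg, maxtxg]. A small standalone sketch of that range sum with made-up sizes:

/* Standalone sketch of the (mintxg, maxtxg] range sum (hypothetical data). */
#include <stdio.h>
#include <stdint.h>

struct entry { uint64_t mintxg, used; };

/* Sum "used" over entries whose key lies in [mintxg, maxtxg), i.e. the
 * space of blocks with birth txg in (mintxg, maxtxg], as above. */
static uint64_t
space_range(const struct entry *e, int n, uint64_t mintxg, uint64_t maxtxg)
{
	uint64_t sum = 0;

	for (int i = 0; i < n; i++)
		if (e[i].mintxg >= mintxg && e[i].mintxg < maxtxg)
			sum += e[i].used;
	return (sum);
}

int
main(void)
{
	struct entry e[] = {
		{ 0, 10 }, { 100, 20 }, { 250, 30 }, { 400, 40 }
	};

	/* Everything born after txg 100: entries keyed 100, 250 and 400. */
	printf("%llu\n", (unsigned long long)
	    space_range(e, 4, 100, UINT64_MAX));	/* prints 90 */
	return (0);
}
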
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+ mutex_exit(&dl->dl_lock);
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_iterate(&bpo,
+ dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ zap_cursor_fini(&zc);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ mutex_enter(&dl->dl_lock);
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ mutex_exit(&dl->dl_lock);
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
index 7ff843044885..b85c373e3d03 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -75,8 +74,6 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/cred.h>
@@ -150,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
}
static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -185,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(zap_update(mos, jumpobj,
perm, 8, 1, &n, tx) == 0);
- spa_history_internal_log(LOG_DS_PERM_UPDATE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_UPDATE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
@@ -194,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -217,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
(void) zap_remove(mos, zapobj, whokey, tx);
VERIFY(0 == zap_destroy(mos, jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s dataset = %llu", whokey,
dd->dd_phys->dd_head_dataset_obj);
continue;
@@ -238,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_destroy(mos,
jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
@@ -531,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
* Check if user has requested permission.
*/
int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
{
- dsl_dataset_t *ds;
dsl_dir_t *dd;
dsl_pool_t *dp;
void *cookie;
@@ -543,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
avl_tree_t permsets;
perm_set_t *setnode;
- error = dsl_dataset_hold(dsname, FTAG, &ds);
- if (error)
- return (error);
-
dp = ds->ds_dir->dd_pool;
mos = dp->dp_meta_objset;
- if (dsl_delegation_on(mos) == B_FALSE) {
- dsl_dataset_rele(ds, FTAG);
+ if (dsl_delegation_on(mos) == B_FALSE)
return (ECANCELED);
- }
if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
- SPA_VERSION_DELEGATED_PERMS) {
- dsl_dataset_rele(ds, FTAG);
+ SPA_VERSION_DELEGATED_PERMS)
return (EPERM);
- }
if (dsl_dataset_is_snapshot(ds)) {
/*
@@ -589,7 +577,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
if (dsl_prop_get_dd(dd,
zfs_prop_to_name(ZFS_PROP_ZONED),
- 8, 1, &zoned, NULL) != 0)
+ 8, 1, &zoned, NULL, B_FALSE) != 0)
break;
if (!zoned)
break;
@@ -636,7 +624,6 @@ again:
error = EPERM;
success:
rw_exit(&dp->dp_config_rwlock);
- dsl_dataset_rele(ds, FTAG);
cookie = NULL;
while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -645,6 +632,22 @@ success:
return (error);
}
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (error)
+ return (error);
+
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
/*
* Other routines.
*/
@@ -739,5 +742,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
boolean_t
dsl_delegation_on(objset_t *os)
{
- return (os->os->os_spa->spa_delegation);
+ return (!!spa_delegation(os->os_spa));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
index 2f312ae3410c..1cd49c8274e8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -32,6 +31,7 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
+#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
@@ -39,8 +39,7 @@
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
/* ARGSUSED */
@@ -63,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
spa_close(dd->dd_pool->dp_spa, dd);
/*
- * The props callback list should be empty since they hold the
- * dir open.
+ * The props callback list should have been cleaned up by
+ * objset_evict().
*/
list_destroy(&dd->dd_prop_cbs);
mutex_destroy(&dd->dd_lock);
@@ -107,6 +106,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_node));
+ dsl_dir_snap_cmtime_update(dd);
+
if (dd->dd_phys->dd_parent_obj) {
err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
NULL, dd, &dd->dd_parent);
@@ -133,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
}
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+ if (err)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ }
+
winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
dsl_dir_evict);
if (winner) {
@@ -392,7 +412,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
{
objset_t *mos = dp->dp_meta_objset;
uint64_t ddobj;
- dsl_dir_phys_t *dsphys;
+ dsl_dir_phys_t *ddphys;
dmu_buf_t *dbuf;
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@@ -407,17 +427,17 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
}
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
+ ddphys = dbuf->db_data;
- dsphys->dd_creation_time = gethrestime_sec();
+ ddphys->dd_creation_time = gethrestime_sec();
if (pds)
- dsphys->dd_parent_obj = pds->dd_object;
- dsphys->dd_props_zapobj = zap_create(mos,
+ ddphys->dd_parent_obj = pds->dd_object;
+ ddphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsphys->dd_child_dir_zapobj = zap_create(mos,
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
- dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
@@ -427,7 +447,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
int
dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
dsl_pool_t *dp = dd->dd_pool;
objset_t *mos = dp->dp_meta_objset;
int err;
@@ -454,19 +475,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t val, obj;
+ dsl_prop_setarg_t psa;
+ uint64_t value = 0;
+ uint64_t obj;
dd_used_t t;
ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
/* Remove our reservation. */
- val = 0;
- dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+ dsl_prop_setarg_init_uint64(&psa, "reservation",
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ &value);
+ psa.psa_effective_value = 0; /* predict default value */
+
+ dsl_dir_set_reservation_sync(ds, &psa, tx);
+
ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
for (t = 0; t < DD_USED_NUM; t++)
@@ -640,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
if (used > quota) {
/* over quota */
myspace = 0;
-
- /*
- * While it's OK to be a little over quota, if
- * we think we are using more space than there
- * is in the pool (which is already 1.6% more than
- * dsl_pool_adjustedsize()), something is very
- * wrong.
- */
- ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
} else {
/*
* the lesser of the space provided by our parent and
@@ -676,8 +696,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
{
uint64_t txg = tx->tx_txg;
uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+ uint64_t deferred = 0;
struct tempreserve *tr;
- int enospc = EDQUOT;
+ int retval = EDQUOT;
int txgidx = txg & TXG_MASK;
int i;
uint64_t ref_rsrv = 0;
@@ -703,7 +724,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
*/
if (first && tx->tx_objset) {
int error;
- dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
error = dsl_dataset_check_quota(ds, checkrefquota,
asize, est_inflight, &used_on_disk, &ref_rsrv);
@@ -723,7 +744,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
quota = dd->dd_phys->dd_quota;
/*
- * Adjust the quota against the actual pool size at the root.
+ * Adjust the quota against the actual pool size at the root
+ * minus any outstanding deferred frees.
* To ensure that it's possible to remove files from a full
* pool without inducing transient overcommits, we throttle
* netfree transactions against a quota that is slightly larger,
@@ -732,10 +754,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* removes to get through.
*/
if (dd->dd_parent == NULL) {
+ spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- if (poolsize < quota) {
- quota = poolsize;
- enospc = ENOSPC;
+ deferred = metaslab_class_get_deferred(spa_normal_class(spa));
+ if (poolsize - deferred < quota) {
+ quota = poolsize - deferred;
+ retval = ENOSPC;
}
}
@@ -745,15 +769,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
- if (used_on_disk + est_inflight > quota) {
- if (est_inflight > 0 || used_on_disk < quota)
- enospc = ERESTART;
+ if (used_on_disk + est_inflight >= quota) {
+ if (est_inflight > 0 || used_on_disk < quota ||
+ (retval == ENOSPC && used_on_disk < quota + deferred))
+ retval = ERESTART;
dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
used_on_disk>>10, est_inflight>>10,
- quota>>10, asize>>10, enospc);
+ quota>>10, asize>>10, retval);
mutex_exit(&dd->dd_lock);
- return (enospc);
+ return (retval);
}
/* We need to up our estimated delta before dropping dd_lock */
@@ -987,13 +1012,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
static int
dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
- int err = 0;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ int err;
uint64_t towrite;
- if (new_quota == 0)
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ if (psa->psa_effective_value == 0)
return (0);
mutex_enter(&dd->dd_lock);
@@ -1005,64 +1033,88 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
*/
towrite = dsl_dir_space_towrite(dd);
if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (new_quota < dd->dd_phys->dd_reserved ||
- new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
+ (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
+ psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
err = ENOSPC;
}
mutex_exit(&dd->dd_lock);
return (err);
}
-/* ARGSUSED */
+extern dsl_syncfunc_t dsl_prop_set_sync;
+
static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
+
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(dd, psa);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
- dd->dd_phys->dd_quota = new_quota;
+ dd->dd_phys->dd_quota = effective_value;
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu ",
- (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu ",
+ (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
+
+ err = dsl_dataset_hold(ddname, FTAG, &ds);
if (err)
return (err);
- if (quota != dd->dd_phys->dd_quota) {
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(dd->dd_pool, 0);
-
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, dd, &quota, 0);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
}
+
+ ASSERT(ds->ds_dir == dd);
+
+ /*
+ * If someone removes a file, then tries to set the quota, we want to
+ * make sure the file freeing takes effect.
+ */
+ txg_wait_open(dd->dd_pool, 0);
+
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, ds, &psa, 0);
+
dsl_dir_close(dd, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value;
uint64_t used, avail;
+ int err;
+
+ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+ return (err);
+
+ effective_value = psa->psa_effective_value;
/*
* If we are doing the preliminary check in open context, the
@@ -1082,37 +1134,40 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
}
- if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
- uint64_t delta = MAX(used, new_reservation) -
+ if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
+ uint64_t delta = MAX(used, effective_value) -
MAX(used, dd->dd_phys->dd_reserved);
if (delta > avail)
return (ENOSPC);
if (dd->dd_phys->dd_quota > 0 &&
- new_reservation > dd->dd_phys->dd_quota)
+ effective_value > dd->dd_phys->dd_quota)
return (ENOSPC);
}
return (0);
}
-/* ARGSUSED */
static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
+ dsl_dataset_t *ds = arg1;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_setarg_t *psa = arg2;
+ uint64_t effective_value = psa->psa_effective_value;
uint64_t used;
int64_t delta;
+ dsl_prop_set_sync(ds, psa, tx);
+ DSL_PROP_CHECK_PREDICTION(dd, psa);
+
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
used = dd->dd_phys->dd_used_bytes;
- delta = MAX(used, new_reservation) -
+ delta = MAX(used, effective_value) -
MAX(used, dd->dd_phys->dd_reserved);
- dd->dd_phys->dd_reserved = new_reservation;
+ dd->dd_phys->dd_reserved = effective_value;
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
@@ -1121,23 +1176,39 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu",
- (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu",
+ (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation)
{
dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_prop_setarg_t psa;
int err;
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
+
+ err = dsl_dataset_hold(ddname, FTAG, &ds);
if (err)
return (err);
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ ASSERT(ds->ds_dir == dd);
+
err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, dd, &reservation, 0);
+ dsl_dir_set_reservation_sync, ds, &psa, 0);
+
dsl_dir_close(dd, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -1175,7 +1246,6 @@ struct renamearg {
const char *mynewname;
};
-/*ARGSUSED*/
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1186,8 +1256,14 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
int err;
uint64_t val;
- /* There should be 2 references: the open and the dirty */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ /*
+ * There should only be one reference, from dmu_objset_rename().
+ * Fleeting holds are also possible (eg, from "zfs list" getting
+ * stats), but any that are present in open context will likely
+ * be gone by syncing context, so only fail from syncing
+ * context.
+ */
+ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
return (EBUSY);
/* check for existing name */
@@ -1216,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct renamearg *ra = arg2;
@@ -1265,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dd->dd_myname, 8, 1, &dd->dd_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+ tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}
int
@@ -1315,3 +1391,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
return (0);
}
+
+timestruc_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ mutex_enter(&dd->dd_lock);
+ t = dd->dd_snap_cmtime;
+ mutex_exit(&dd->dd_lock);
+
+ return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ gethrestime(&t);
+ mutex_enter(&dd->dd_lock);
+ dd->dd_snap_cmtime = t;
+ mutex_exit(&dd->dd_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 0f00bc965dcd..ea5e60d933e4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -19,14 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
@@ -36,22 +38,47 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
-int zfs_txg_synctime = 5; /* target secs to sync a txg */
+int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
uint64_t zfs_write_limit_inflated = 0;
uint64_t zfs_write_limit_override = 0;
-extern uint64_t zfs_write_limit_min;
kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
-static int
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
+ &zfs_no_write_throttle, 0, "");
+TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
+ &zfs_write_limit_shift, 0, "2^N of physical memory");
+SYSCTL_DECL(_vfs_zfs_txg);
+TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
+SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
+ &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");
+
+TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
+ &zfs_write_limit_min, 0, "Minimum write limit");
+TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
+ &zfs_write_limit_max, 0, "Maximum data payload per txg");
+TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
+ &zfs_write_limit_inflated, 0, "");
+TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
+ &zfs_write_limit_override, 0, "");
+
+int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
uint64_t obj;
@@ -89,7 +116,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dataset_t, ds_synced_link));
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
@@ -104,13 +130,13 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dsl_dir_t *dd;
dsl_dataset_t *ds;
- objset_impl_t *osi;
+ uint64_t obj;
rw_enter(&dp->dp_config_rwlock, RW_WRITER);
- err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+ &dp->dp_meta_objset);
if (err)
goto out;
- dp->dp_meta_objset = &osi->os;
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
@@ -135,8 +161,8 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
FTAG, &ds);
if (err == 0) {
err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, dp,
- &dp->dp_origin_snap);
+ ds->ds_phys->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
dsl_dataset_rele(ds, FTAG);
}
dsl_dir_close(dd, dp);
@@ -144,53 +170,30 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
goto out;
}
- /* get scrub status */
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func);
- if (err == 0) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark);
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
if (err)
goto out;
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors);
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
if (err)
goto out;
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- /*
- * A new-type scrub was in progress on an old
- * pool. Restart from the beginning, since the
- * old software may have changed the pool in the
- * meantime.
- */
- dsl_pool_scrub_restart(dp);
- }
- } else {
- /*
- * It's OK if there is no scrub in progress (and if
- * there was an I/O error, ignore it).
- */
- err = 0;
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
}
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+ &dp->dp_tmp_userrefs_obj);
+ if (err == ENOENT)
+ err = 0;
+ if (err)
+ goto out;
+
+ err = dsl_scan_init(dp, txg);
+
out:
rw_exit(&dp->dp_config_rwlock);
if (err)
@@ -215,23 +218,27 @@ dsl_pool_close(dsl_pool_t *dp)
dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
if (dp->dp_mos_dir)
dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir)
+ dsl_dir_close(dp->dp_free_dir, dp);
if (dp->dp_root_dir)
dsl_dir_close(dp->dp_root_dir, dp);
+ bpobj_close(&dp->dp_free_bpobj);
+
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
if (dp->dp_meta_objset)
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+ dmu_objset_evict(dp->dp_meta_objset);
txg_list_destroy(&dp->dp_dirty_datasets);
- txg_list_destroy(&dp->dp_dirty_dirs);
txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_dirty_dirs);
list_destroy(&dp->dp_synced_datasets);
arc_flush(dp->dp_spa);
txg_fini(dp);
+ dsl_scan_fini(dp);
rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
- mutex_destroy(&dp->dp_scrub_cancel_lock);
taskq_destroy(dp->dp_vnrele_taskq);
if (dp->dp_blkstats)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -244,19 +251,22 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
- objset_impl_t *osip;
+ objset_t *os;
dsl_dataset_t *ds;
- uint64_t dsobj;
+ uint64_t obj;
/* create and open the MOS (meta-objset) */
- dp->dp_meta_objset = &dmu_objset_create_impl(spa,
- NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+ dp->dp_meta_objset = dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
ASSERT3U(err, ==, 0);
+ /* Initialize scan structures */
+ VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
/* create and open the root dir */
dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@@ -267,18 +277,33 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
VERIFY(0 == dsl_pool_open_special_dir(dp,
MOS_DIR_NAME, &dp->dp_mos_dir));
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free_bplist */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
/* create the root dataset */
- dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
/* create the root objset */
- VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- osip = dmu_objset_create_impl(dp->dp_spa, ds,
+ VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+ os = dmu_objset_create_impl(dp->dp_spa, ds,
dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
- zfs_create_fs(&osip->os, kcred, zplprops, tx);
+ zfs_create_fs(os, kcred, zplprops, tx);
#endif
dsl_dataset_rele(ds, FTAG);
@@ -287,6 +312,14 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
return (dp);
}
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@@ -295,11 +328,19 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dir_t *dd;
dsl_dataset_t *ds;
dsl_sync_task_group_t *dstg;
- objset_impl_t *mosi = dp->dp_meta_objset->os;
+ objset_t *mos = dp->dp_meta_objset;
hrtime_t start, write_time;
uint64_t data_written;
int err;
+ /*
+ * We need to copy dp_space_towrite() before doing
+ * dsl_sync_task_group_sync(), because
+ * dsl_dataset_snapshot_reserve_space() will increase
+ * dp_space_towrite but not actually write anything.
+ */
+ data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
tx = dmu_tx_create_assigned(dp, txg);
dp->dp_read_overhead = 0;
@@ -325,11 +366,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
for (ds = list_head(&dp->dp_synced_datasets); ds;
ds = list_next(&dp->dp_synced_datasets, ds))
- dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+ dmu_objset_do_userquota_updates(ds->ds_objset, tx);
/*
* Sync the datasets again to push out the changes due to
- * userquota updates. This must be done before we process the
+ * userspace updates. This must be done before we process the
* sync tasks, because that could cause a snapshot of a dataset
* whose ds_bp will be rewritten when we do this 2nd sync.
*/
@@ -341,6 +382,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}
err = zio_wait(zio);
+ /*
+ * Move dead blocks from the pending deadlist to the on-disk
+ * deadlist.
+ */
+ for (ds = list_head(&dp->dp_synced_datasets); ds;
+ ds = list_next(&dp->dp_synced_datasets, ds)) {
+ bplist_iterate(&ds->ds_pending_deadlist,
+ deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+ }
+
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
/*
* No more sync tasks should have been added while we
@@ -356,14 +407,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dir_sync(dd, tx);
write_time += gethrtime() - start;
- if (spa_sync_pass(dp->dp_spa) == 1)
- dsl_pool_scrub_sync(dp, tx);
-
start = gethrtime();
- if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
- list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+ if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+ list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(mosi, zio, tx);
+ dmu_objset_sync(mos, zio, tx);
err = zio_wait(zio);
ASSERT(err == 0);
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
@@ -376,7 +424,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dmu_tx_commit(tx);
- data_written = dp->dp_space_towrite[txg & TXG_MASK];
dp->dp_space_towrite[txg & TXG_MASK] = 0;
ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
@@ -401,10 +448,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* amount of write traffic allowed into each transaction group.
* Weight the throughput calculation towards the current value:
* thru = 3/4 old_thru + 1/4 new_thru
+ *
+ * Note: write_time is in nanosecs, so write_time/MICROSEC
+ * yields millisecs
*/
ASSERT(zfs_write_limit_min > 0);
- if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
- uint64_t throughput = (data_written * NANOSEC) / write_time;
+ if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
+ uint64_t throughput = data_written / (write_time / MICROSEC);
+
if (dp->dp_throughput)
dp->dp_throughput = throughput / 4 +
3 * dp->dp_throughput / 4;
@@ -412,21 +463,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dp->dp_throughput = throughput;
dp->dp_write_limit = MIN(zfs_write_limit_inflated,
MAX(zfs_write_limit_min,
- dp->dp_throughput * zfs_txg_synctime));
+ dp->dp_throughput * zfs_txg_synctime_ms));
}
}
void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
dsl_dataset_t *ds;
+ objset_t *os;
while (ds = list_head(&dp->dp_synced_datasets)) {
list_remove(&dp->dp_synced_datasets, ds);
- ASSERT(ds->ds_user_ptr != NULL);
- zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+ os = ds->ds_objset;
+ zil_clean(os->os_zil, txg);
+ ASSERT(!dmu_objset_is_dirty(os, txg));
dmu_buf_rele(ds->ds_dbuf, ds);
}
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}
/*
@@ -627,6 +681,65 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
tx, DS_FIND_CHILDREN));
}
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj) {
+ dsl_dataset_t *origin;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t obj;
+
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+
+ VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+}
+
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
@@ -641,7 +754,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+ dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
dp, &dp->dp_origin_snap));
dsl_dataset_rele(ds, FTAG);
@@ -653,3 +766,108 @@ dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
return (dp->dp_vnrele_taskq);
}
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+ zap_attribute_t za;
+ zap_cursor_t zc;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+
+ if (zapobj == 0)
+ return;
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ char *htag;
+ uint64_t dsobj;
+
+ htag = strchr(za.za_name, '-');
+ *htag = '\0';
+ ++htag;
+ dsobj = strtonum(za.za_name, NULL);
+ (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(dp->dp_tmp_userrefs_obj == 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
+ DMU_OT_NONE, 0, tx);
+
+ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
+ sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ char *name;
+ int error;
+
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * If the pool was created prior to SPA_VERSION_USERREFS, the
+ * zap object for temporary holds might not exist yet.
+ */
+ if (zapobj == 0) {
+ if (holding) {
+ dsl_pool_user_hold_create_obj(dp, tx);
+ zapobj = dp->dp_tmp_userrefs_obj;
+ } else {
+ return (ENOENT);
+ }
+ }
+
+ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+ if (holding)
+ error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+ else
+ error = zap_remove(mos, zapobj, name, tx);
+ strfree(name);
+
+ return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ uint64_t *now, dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+ tx, B_FALSE));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
index d06493236805..aa66b32e7938 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -19,10 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
@@ -31,14 +31,16 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include "zfs_prop.h"
+#define ZPROP_INHERIT_SUFFIX "$inherit"
+#define ZPROP_RECVD_SUFFIX "$recvd"
+
static int
-dodefault(const char *propname, int intsz, int numint, void *buf)
+dodefault(const char *propname, int intsz, int numints, void *buf)
{
zfs_prop_t prop;
@@ -55,9 +57,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
if (intsz != 1)
return (EOVERFLOW);
(void) strncpy(buf, zfs_prop_default_string(prop),
- numint);
+ numints);
} else {
- if (intsz != 8 || numint < 1)
+ if (intsz != 8 || numints < 1)
return (EOVERFLOW);
*(uint64_t *)buf = zfs_prop_default_numeric(prop);
@@ -68,11 +70,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
int
dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numint, void *buf, char *setpoint)
+ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
{
int err = ENOENT;
+ dsl_dir_t *target = dd;
objset_t *mos = dd->dd_pool->dp_meta_objset;
zfs_prop_t prop;
+ boolean_t inheritable;
+ boolean_t inheriting = B_FALSE;
+ char *inheritstr;
+ char *recvdstr;
ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
@@ -80,51 +87,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
setpoint[0] = '\0';
prop = zfs_name_to_prop(propname);
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
/*
- * Note: dd may be NULL, therefore we shouldn't dereference it
- * ouside this loop.
+ * Note: dd may become NULL, therefore we shouldn't dereference it
+ * after this loop.
*/
for (; dd != NULL; dd = dd->dd_parent) {
ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
- propname, intsz, numint, buf);
+
+ if (dd != target || snapshot) {
+ if (!inheritable)
+ break;
+ inheriting = B_TRUE;
+ }
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+ intsz, numints, buf);
if (err != ENOENT) {
- if (setpoint)
+ if (setpoint != NULL && err == 0)
dsl_dir_name(dd, setpoint);
break;
}
/*
- * Break out of this loop for non-inheritable properties.
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
*/
- if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+ inheritstr);
+ if (err != 0 && err != ENOENT)
break;
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ recvdstr, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0) {
+ if (inheriting) {
+ dsl_dir_name(dd, setpoint);
+ } else {
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If we found an explicit inheritance entry, err is zero even
+ * though we haven't yet found the value, so reinitializing err
+ * at the end of the loop (instead of at the beginning) ensures
+ * that err has a valid post-loop value.
+ */
+ err = ENOENT;
}
+
if (err == ENOENT)
- err = dodefault(propname, intsz, numint, buf);
+ err = dodefault(propname, intsz, numints, buf);
+
+ strfree(inheritstr);
+ strfree(recvdstr);
return (err);
}
int
dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
- int intsz, int numint, void *buf, char *setpoint)
+ int intsz, int numints, void *buf, char *setpoint)
{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t inheritable;
+ boolean_t snapshot;
+ uint64_t zapobj;
+
ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
+ zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+
+ if (zapobj != 0) {
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
- if (ds->ds_phys->ds_props_obj) {
- int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_props_obj, propname, intsz, numint, buf);
+ ASSERT(snapshot);
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
if (err != ENOENT) {
- if (setpoint)
+ if (setpoint != NULL && err == 0)
dsl_dataset_name(ds, setpoint);
return (err);
}
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ if (inheritable) {
+ char *inheritstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, zapobj, inheritstr);
+ strfree(inheritstr);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ char *recvdstr = kmem_asprintf("%s%s", propname,
+ ZPROP_RECVD_SUFFIX);
+ err = zap_lookup(mos, zapobj, recvdstr,
+ intsz, numints, buf);
+ strfree(recvdstr);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ return (err);
+ }
+ }
}
return (dsl_prop_get_dd(ds->ds_dir, propname,
- intsz, numint, buf, setpoint));
+ intsz, numints, buf, setpoint, snapshot));
}
/*
@@ -168,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
- NULL, cbr, &dd));
if (need_rwlock)
rw_exit(&dp->dp_config_rwlock);
- /* Leave dir open until this callback is unregistered */
return (0);
}
@@ -210,6 +298,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname,
return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
}
+void
+dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+ zprop_source_t source, uint64_t *value)
+{
+ psa->psa_name = propname;
+ psa->psa_source = source;
+ psa->psa_intsz = 8;
+ psa->psa_numints = 1;
+ psa->psa_value = value;
+
+ psa->psa_effective_value = -1ULL;
+}
+
+/*
+ * Predict the effective value of the given special property if it were set with
+ * the given value and source. This is not a general purpose function. It exists
+ * only to handle the special requirements of the quota and reservation
+ * properties. The fact that these properties are non-inheritable greatly
+ * simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called with
+ * a property not handled by this function.
+ */
+int
+dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+ const char *propname = psa->psa_name;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ zprop_source_t source = psa->psa_source;
+ objset_t *mos;
+ uint64_t zapobj;
+ uint64_t version;
+ char *recvdstr;
+ int err = 0;
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFQUOTA:
+ case ZFS_PROP_REFRESERVATION:
+ break;
+ default:
+ return (-1);
+ }
+
+ mos = dd->dd_pool->dp_meta_objset;
+ zapobj = dd->dd_phys->dd_props_zapobj;
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ version = spa_version(dd->dd_pool->dp_spa);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /* Revert to the received value, if any. */
+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = 0;
+ break;
+ case ZPROP_SRC_LOCAL:
+ psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * If there's no local setting, then the new received value will
+ * be the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * We're clearing the received value, so the local setting (if
+ * it exists) remains the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1,
+ &psa->psa_effective_value);
+ if (err == ENOENT)
+ psa->psa_effective_value = 0;
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
+ }
+
+ strfree(recvdstr);
+
+ if (err == ENOENT)
+ return (0);
+
+ return (err);
+}
+
+#ifdef ZFS_DEBUG
+void
+dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+ zfs_prop_t prop = zfs_name_to_prop(psa->psa_name);
+ uint64_t intval;
+ char setpoint[MAXNAMELEN];
+ uint64_t version = spa_version(dd->dd_pool->dp_spa);
+ int err;
+
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ return;
+ }
+ }
+
+ err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval,
+ setpoint, B_FALSE);
+ if (err == 0 && intval != psa->psa_effective_value) {
+ cmn_err(CE_PANIC, "%s property, source: %x, "
+ "predicted effective value: %llu, "
+ "actual effective value: %llu (setpoint: %s)",
+ psa->psa_name, psa->psa_source,
+ (unsigned long long)psa->psa_effective_value,
+ (unsigned long long)intval, setpoint);
+ }
+}
+#endif
+
/*
* Unregister this callback. Return 0 on success, ENOENT if ddname is
* invalid, ENOMSG if no matching callback registered.
@@ -241,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
- /* Clean up from dsl_prop_register */
- dsl_dir_close(dd, cbr);
return (0);
}
@@ -277,7 +494,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
zap_cursor_t zc;
zap_attribute_t *za;
int err;
- uint64_t dummyval;
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
@@ -289,8 +505,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
* If the prop is set here, then this change is not
* being inherited here or below; stop the recursion.
*/
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- 8, 1, &dummyval);
+ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
if (err == 0) {
dsl_dir_close(dd, FTAG);
return;
@@ -310,8 +525,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
* If the property is set on this ds, then it is not
* inherited here; don't call the callback.
*/
- if (propobj && 0 == zap_lookup(mos, propobj, propname,
- 8, 1, &dummyval))
+ if (propobj && 0 == zap_contains(mos, propobj, propname))
continue;
cbr->cbr_func(cbr->cbr_arg, value);
@@ -331,30 +545,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
dsl_dir_close(dd, FTAG);
}
-struct prop_set_arg {
- const char *name;
- int intsz;
- int numints;
- const void *buf;
-};
-
-
-static void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+void
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- struct prop_set_arg *psa = arg2;
+ dsl_prop_setarg_t *psa = arg2;
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t zapobj, intval;
+ uint64_t zapobj, intval, dummy;
int isint;
char valbuf[32];
- char *valstr;
+ char *valstr = NULL;
+ char *inheritstr;
+ char *recvdstr;
+ char *tbuf = NULL;
+ int err;
+ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ const char *propname = psa->psa_name;
+ zprop_source_t source = psa->psa_source;
- isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+ isint = (dodefault(propname, 8, 1, &intval) == 0);
- if (dsl_dataset_is_snapshot(ds)) {
- ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >=
- SPA_VERSION_SNAP_PROPS);
+ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+ ASSERT(version >= SPA_VERSION_SNAP_PROPS);
if (ds->ds_phys->ds_props_obj == 0) {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_props_obj =
@@ -366,22 +578,97 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
}
- if (psa->numints == 0) {
- int err = zap_remove(mos, zapobj, psa->name, tx);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION)
+ return;
+
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /*
+ * revert to received value, if any (inherit -S)
+ * - remove propname
+ * - remove propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ case ZPROP_SRC_LOCAL:
+ /*
+ * remove propname$inherit
+ * set propname -> value
+ */
+ err = zap_remove(mos, zapobj, inheritstr, tx);
ASSERT(err == 0 || err == ENOENT);
- if (isint) {
- VERIFY(0 == dsl_prop_get_ds(ds,
- psa->name, 8, 1, &intval, NULL));
+ VERIFY(0 == zap_update(mos, zapobj, propname,
+ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx));
+ break;
+ case ZPROP_SRC_INHERITED:
+ /*
+ * explicitly inherit
+ * - remove propname
+ * - set propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (version >= SPA_VERSION_RECVD_PROPS &&
+ dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy,
+ NULL) == 0) {
+ dummy = 0;
+ err = zap_update(mos, zapobj, inheritstr,
+ 8, 1, &dummy, tx);
+ ASSERT(err == 0);
}
- } else {
- VERIFY(0 == zap_update(mos, zapobj, psa->name,
- psa->intsz, psa->numints, psa->buf, tx));
- if (isint)
- intval = *(uint64_t *)psa->buf;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * set propname$recvd -> value
+ */
+ err = zap_update(mos, zapobj, recvdstr,
+ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx);
+ ASSERT(err == 0);
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+ /*
+ * clear local and received settings
+ * - remove propname
+ * - remove propname$inherit
+ * - remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ /* FALLTHRU */
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, recvdstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
}
+ strfree(inheritstr);
+ strfree(recvdstr);
+
if (isint) {
- if (dsl_dataset_is_snapshot(ds)) {
+ VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL));
+
+ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
dsl_prop_cb_record_t *cbr;
/*
* It's a snapshot; nothing can inherit this
@@ -392,58 +679,85 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
if (cbr->cbr_ds == ds &&
- strcmp(cbr->cbr_propname, psa->name) == 0)
+ strcmp(cbr->cbr_propname, propname) == 0)
cbr->cbr_func(cbr->cbr_arg, intval);
}
mutex_exit(&ds->ds_dir->dd_lock);
} else {
dsl_prop_changed_notify(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_object, psa->name, intval, TRUE);
+ ds->ds_dir->dd_object, propname, intval, TRUE);
}
- }
- if (isint) {
+
(void) snprintf(valbuf, sizeof (valbuf),
"%lld", (longlong_t)intval);
valstr = valbuf;
} else {
- valstr = (char *)psa->buf;
+ if (source == ZPROP_SRC_LOCAL) {
+ valstr = (char *)psa->psa_value;
+ } else {
+ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ if (dsl_prop_get_ds(ds, propname, 1,
+ ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+ valstr = tbuf;
+ }
}
- spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
- LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
- "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
+
+ spa_history_log_internal((source == ZPROP_SRC_NONE ||
+ source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
+ LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx,
+ "%s=%s dataset = %llu", propname,
+ (valstr == NULL ? "" : valstr), ds->ds_object);
+
+ if (tbuf != NULL)
+ kmem_free(tbuf, ZAP_MAXVALUELEN);
}
void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- nvlist_t *nvl = arg2;
+ dsl_props_arg_t *pa = arg2;
+ nvlist_t *props = pa->pa_props;
+ dsl_prop_setarg_t psa;
nvpair_t *elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- struct prop_set_arg psa;
+ psa.psa_source = pa->pa_source;
- psa.name = nvpair_name(elem);
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ nvpair_t *pair = elem;
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- VERIFY(nvpair_value_string(elem,
- (char **)&psa.buf) == 0);
- psa.intsz = 1;
- psa.numints = strlen(psa.buf) + 1;
+ psa.psa_name = nvpair_name(pair);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_STRING) {
+ VERIFY(nvpair_value_string(pair,
+ (char **)&psa.psa_value) == 0);
+ psa.psa_intsz = 1;
+ psa.psa_numints = strlen(psa.psa_value) + 1;
} else {
uint64_t intval;
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
- psa.intsz = sizeof (intval);
- psa.numints = 1;
- psa.buf = &intval;
+ VERIFY(nvpair_value_uint64(pair, &intval) == 0);
+ psa.psa_intsz = sizeof (intval);
+ psa.psa_numints = 1;
+ psa.psa_value = &intval;
}
- dsl_prop_set_sync(ds, &psa, cr, tx);
+ dsl_prop_set_sync(ds, &psa, tx);
}
}
void
-dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx)
+dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
@@ -454,18 +768,19 @@ dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
- spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx,
"%s=%llu dataset = %llu", name, (u_longlong_t)val,
dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_prop_set(const char *dsname, const char *propname,
+dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source,
int intsz, int numints, const void *buf)
{
dsl_dataset_t *ds;
+ uint64_t version;
int err;
- struct prop_set_arg psa;
+ dsl_prop_setarg_t psa;
/*
* We must do these checks before we get to the syncfunc, since
@@ -473,23 +788,30 @@ dsl_prop_set(const char *dsname, const char *propname,
*/
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG);
- if (intsz * numints >= ZAP_MAXVALUELEN)
- return (E2BIG);
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (E2BIG);
+ }
if (dsl_dataset_is_snapshot(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
- psa.name = propname;
- psa.intsz = intsz;
- psa.numints = numints;
- psa.buf = buf;
+ psa.psa_name = propname;
+ psa.psa_source = source;
+ psa.psa_intsz = intsz;
+ psa.psa_numints = numints;
+ psa.psa_value = buf;
+ psa.psa_effective_value = -1ULL;
+
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
NULL, dsl_prop_set_sync, ds, &psa, 2);
@@ -498,158 +820,318 @@ dsl_prop_set(const char *dsname, const char *propname,
}
int
-dsl_props_set(const char *dsname, nvlist_t *nvl)
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
{
dsl_dataset_t *ds;
+ uint64_t version;
nvpair_t *elem = NULL;
+ dsl_props_arg_t pa;
int err;
+ if (err = dsl_dataset_hold(dsname, FTAG, &ds))
+ return (err);
/*
* Do these checks before the syncfunc, since it can't fail.
*/
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN)
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
return (ENAMETOOLONG);
+ }
if (nvpair_type(elem) == DATA_TYPE_STRING) {
char *valstr;
VERIFY(nvpair_value_string(elem, &valstr) == 0);
- if (strlen(valstr) >= ZAP_MAXVALUELEN)
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
return (E2BIG);
+ }
}
}
- if (err = dsl_dataset_hold(dsname, FTAG, &ds))
- return (err);
-
if (dsl_dataset_is_snapshot(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
+ pa.pa_props = props;
+ pa.pa_source = source;
+
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- NULL, dsl_props_set_sync, ds, nvl, 2);
+ NULL, dsl_props_set_sync, ds, &pa, 2);
dsl_dataset_rele(ds, FTAG);
return (err);
}
+typedef enum dsl_prop_getflags {
+ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
+ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
+ DSL_PROP_GET_LOCAL = 0x4, /* local properties */
+ DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err = 0;
+
+ for (zap_cursor_init(&zc, mos, propobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ char buf[ZAP_MAXNAMELEN];
+ char *valstr;
+ const char *suffix;
+ const char *propname;
+ const char *source;
+
+ suffix = strchr(za.za_name, '$');
+
+ if (suffix == NULL) {
+ /*
+ * Skip local properties if we only want received
+ * properties.
+ */
+ if (flags & DSL_PROP_GET_RECEIVED)
+ continue;
+
+ propname = za.za_name;
+ source = setpoint;
+ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+ /* Skip explicitly inherited entries. */
+ continue;
+ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+ if (flags & DSL_PROP_GET_LOCAL)
+ continue;
+
+ (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+ buf[suffix - za.za_name] = '\0';
+ propname = buf;
+
+ if (!(flags & DSL_PROP_GET_RECEIVED)) {
+ /* Skip if locally overridden. */
+ err = zap_contains(mos, propobj, propname);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+
+ /* Skip if explicitly inherited. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ strfree(valstr);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+ }
+
+ source = ((flags & DSL_PROP_GET_INHERITING) ?
+ setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else {
+ /*
+ * For backward compatibility, skip suffixes we don't
+ * recognize.
+ */
+ continue;
+ }
+
+ prop = zfs_name_to_prop(propname);
+
+ /* Skip non-inheritable properties. */
+ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ continue;
+
+ /* Skip properties not valid for this type. */
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+ continue;
+
+ /* Skip properties already defined. */
+ if (nvlist_exists(nv, propname))
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
/*
* Iterate over all properties for this dataset and return them in an nvlist.
*/
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local)
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+ dsl_prop_getflags_t flags)
{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
dsl_dir_t *dd = ds->ds_dir;
- boolean_t snapshot = dsl_dataset_is_snapshot(ds);
- int err = 0;
dsl_pool_t *dp = dd->dd_pool;
objset_t *mos = dp->dp_meta_objset;
- uint64_t propobj = ds->ds_phys->ds_props_obj;
+ int err = 0;
+ char setpoint[MAXNAMELEN];
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (local && snapshot && !propobj)
- return (0);
+ if (dsl_dataset_is_snapshot(ds))
+ flags |= DSL_PROP_GET_SNAPSHOT;
rw_enter(&dp->dp_config_rwlock, RW_READER);
- while (dd != NULL) {
- char setpoint[MAXNAMELEN];
- zap_cursor_t zc;
- zap_attribute_t za;
- dsl_dir_t *dd_next;
-
- if (propobj) {
- dsl_dataset_name(ds, setpoint);
- dd_next = dd;
- } else {
- dsl_dir_name(dd, setpoint);
- propobj = dd->dd_phys->dd_props_zapobj;
- dd_next = dd->dd_parent;
+
+ if (ds->ds_phys->ds_props_obj != 0) {
+ ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+ dsl_dataset_name(ds, setpoint);
+ err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
+ setpoint, flags, *nvp);
+ if (err)
+ goto out;
+ }
+
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+ if (flags & (DSL_PROP_GET_LOCAL |
+ DSL_PROP_GET_RECEIVED))
+ break;
+ flags |= DSL_PROP_GET_INHERITING;
}
+ dsl_dir_name(dd, setpoint);
+ err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
+ setpoint, flags, *nvp);
+ if (err)
+ break;
+ }
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+}
- for (zap_cursor_init(&zc, mos, propobj);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- nvlist_t *propval;
- zfs_prop_t prop = zfs_name_to_prop(za.za_name);
+boolean_t
+dsl_prop_get_hasrecvd(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int rc;
+ uint64_t dummy;
- /* Skip non-inheritable properties. */
- if (prop != ZPROP_INVAL &&
- !zfs_prop_inheritable(prop) &&
- (dd != ds->ds_dir || (snapshot && dd != dd_next)))
- continue;
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL);
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+ return (rc == 0);
+}
- /* Skip properties not valid for this type. */
- if (snapshot && prop != ZPROP_INVAL &&
- !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
- continue;
+static void
+dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ uint64_t dummy = 0;
+ dsl_prop_setarg_t psa;
- /* Skip properties already defined */
- if (nvlist_lookup_nvlist(*nvp, za.za_name,
- &propval) == 0)
- continue;
+ if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS)
+ return;
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- if (za.za_integer_length == 1) {
- /*
- * String property
- */
- char *tmp = kmem_alloc(za.za_num_integers,
- KM_SLEEP);
- err = zap_lookup(mos, propobj,
- za.za_name, 1, za.za_num_integers, tmp);
- if (err != 0) {
- kmem_free(tmp, za.za_num_integers);
- break;
- }
- VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
- tmp) == 0);
- kmem_free(tmp, za.za_num_integers);
- } else {
- /*
- * Integer property
- */
- ASSERT(za.za_integer_length == 8);
- (void) nvlist_add_uint64(propval, ZPROP_VALUE,
- za.za_first_integer);
- }
+ dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy);
- VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
- setpoint) == 0);
- VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
- propval) == 0);
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
+ (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL,
+ dsl_prop_set_sync, ds, &psa, 2);
+}
- if (err != ENOENT)
- break;
- err = 0;
- /*
- * If we are just after the props that have been set
- * locally, then we are done after the first iteration.
- */
- if (local)
- break;
- dd = dd_next;
- propobj = 0;
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+void
+dsl_prop_set_hasrecvd(objset_t *os)
+{
+ if (dsl_prop_get_hasrecvd(os)) {
+ ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+ return;
}
- rw_exit(&dp->dp_config_rwlock);
+ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL);
+}
- return (err);
+void
+dsl_prop_unset_hasrecvd(objset_t *os)
+{
+ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE);
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(objset_t *os, nvlist_t **nvp)
+{
+ /*
+ * Received properties are not distinguishable from local properties
+ * until the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ?
+ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags));
}
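
A minimal user-space sketch of the fallback above (the toy_-prefixed names and values are hypothetical stand-ins, not the real dsl_prop interfaces): until a dataset has actually received properties on a pool at or past SPA_VERSION_RECVD_PROPS, a request for the received properties simply falls back to the local ones, which is exactly what dsl_prop_get_received() does.

#include <stdio.h>

/* Toy stand-ins for dsl_prop_getflags_t; the values are illustrative. */
#define	TOY_GET_LOCAL		0x1
#define	TOY_GET_RECEIVED	0x2

/*
 * Stand-in for dsl_prop_get_hasrecvd(): true only once a receive has
 * happened on a pool at or past the received-properties version.
 */
static int
toy_has_recvd(int spa_version, int recvd_props_version, int received_before)
{
	return (spa_version >= recvd_props_version && received_before);
}

int
main(void)
{
	int flags = toy_has_recvd(22, 22, 0) ?
	    TOY_GET_RECEIVED : TOY_GET_LOCAL;

	printf("requesting %s properties\n",
	    flags == TOY_GET_RECEIVED ? "received" : "local");
	return (0);
}
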
void
dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
{
nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+ uint64_t default_value;
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ /* Indicate the default source if we can. */
+ if (dodefault(propname, 8, 1, &default_value) == 0 &&
+ value == default_value) {
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+ }
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
nvlist_free(propval);
}
@@ -657,9 +1139,15 @@ void
dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
{
nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
nvlist_free(propval);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
new file mode 100644
index 000000000000..56d41083673e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -0,0 +1,1766 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scan_cb_t dsl_scan_defrag_cb;
+static scan_cb_t dsl_scan_scrub_cb;
+static scan_cb_t dsl_scan_remove_cb;
+static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_top_maxinflight = 32; /* maximum I/Os per top-level vdev */
+int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
+int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
+int zfs_scan_idle = 50; /* idle window in clock ticks */
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ scn->scn_restart_txg);
+ }
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan) {
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (EBUSY);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+ pool_scan_func_t *funcp = arg2;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_restart_txg = 0;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ } else {
+ spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ dsl_scan_sync_state(scn, tx);
+
+ spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (complete)
+ scn->scn_phys.scn_state = DSS_FINISHED;
+ else
+ scn->scn_phys.scn_state = DSS_CANCELED;
+
+ spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+ "complete=%u", complete);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ spa->spa_scrub_started = B_FALSE;
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+ if (complete) {
+ spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+ ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return (ENOENT);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ boolean_t complete = B_FALSE;
+ int err;
+
+ err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+ return (err);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp,
+ const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+int
+dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read(pio, spa, bpp, pbuf, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+int
+dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read_nolock(pio, spa, bpp, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+ return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+ const zbookmark_t *zb2)
+{
+ uint64_t zb1nextL0, zb2thisobj;
+
+ ASSERT(zb1->zb_objset == zb2->zb_objset);
+ ASSERT(zb2->zb_level == 0);
+
+ /*
+ * A bookmark in the deadlist is considered to be after
+ * everything else.
+ */
+ if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+ return (B_TRUE);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ zb1nextL0 = (zb1->zb_blkid + 1) <<
+ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+ zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t nextobj = zb1nextL0 *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ return (nextobj <= zb2thisobj);
+ }
+
+ if (zb1->zb_object < zb2thisobj)
+ return (B_TRUE);
+ if (zb1->zb_object > zb2thisobj)
+ return (B_FALSE);
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ return (B_FALSE);
+ return (zb1nextL0 <= zb2->zb_blkid);
+}
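
As a concrete check of the shift arithmetic in bookmark_is_before(), here is a tiny stand-alone sketch assuming a typical 16K indirect block (dn_indblkshift = 14) and 128-byte block pointers (SPA_BLKPTRSHIFT = 7), so each indirect block maps 128 children: a level-1 bookmark with blkid 3 covers level-0 blkids 384 through 511, and the first level-0 blkid past it is 512.

#include <stdio.h>
#include <stdint.h>

#define	TOY_SPA_BLKPTRSHIFT	7	/* log2 of a 128-byte block pointer */

int
main(void)
{
	uint64_t indblkshift = 14;	/* 16K indirect block (assumed) */
	uint64_t epbs = indblkshift - TOY_SPA_BLKPTRSHIFT; /* 7: 128 ptrs/blk */
	uint64_t zb1_level = 1, zb1_blkid = 3;

	/* First level-0 blkid beyond everything under (level 1, blkid 3). */
	uint64_t zb1nextL0 = (zb1_blkid + 1) << (zb1_level * epbs);

	printf("zb1nextL0 = %llu\n", (unsigned long long)zb1nextL0); /* 512 */
	return (0);
}
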
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (dsl_dataset_is_snapshot(ds))
+ return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+}
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+{
+ uint64_t elapsed_nanosecs;
+ int mintime;
+
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_pausing)
+ return (B_TRUE); /* we're already pausing */
+
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb && zb->zb_level != 0)
+ return (B_FALSE);
+
+ mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > mintime &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa)) {
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ }
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ scn->scn_pausing = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
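
One easy-to-miss detail in dsl_scan_check_pause(): dividing a nanosecond count by MICROSEC (1,000,000) yields milliseconds, matching the millisecond units of zfs_scan_min_time_ms and zfs_resilver_min_time_ms, while dividing by NANOSEC yields whole seconds for the zfs_txg_timeout comparison. A quick stand-alone sanity check of the conversions (the constants mirror the usual illumos time values):

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC		1000000000LL	/* nanoseconds per second */
#define	MICROSEC	1000000LL	/* note: ns / MICROSEC == milliseconds */

int
main(void)
{
	int64_t elapsed_nanosecs = 1500 * MICROSEC;	/* 1.5 seconds elapsed */
	int mintime = 1000;				/* 1000 ms budget */

	printf("elapsed = %lld ms (%lld s), over budget = %d\n",
	    (long long)(elapsed_nanosecs / MICROSEC),
	    (long long)(elapsed_nanosecs / NANOSEC),
	    elapsed_nanosecs / MICROSEC > mintime);
	return (0);
}
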
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+	 * One block ("stubby") may have been allocated a long time ago; we
+	 * still want to visit it because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+ uint64_t objset, uint64_t object, uint64_t blkid)
+{
+ zbookmark_t czb;
+ uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+ if (zfs_no_scrub_prefetch)
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ return;
+
+ SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+ /*
+ * XXX need to make sure all of these arc_read() prefetches are
+ * done before setting xlateall (similar to dsl_read())
+ */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for pausing
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Return nonzero on i/o error.
+ * Return new buf to write out in *bufp.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ zb->zb_object, zb->zb_blkid * epb + i);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ *bufp, ds, scn, ostype, tx);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+ uint32_t flags = ARC_WAIT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ dnode_phys_t *cdnp;
+ int i, j;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (j = 0; j < cdnp->dn_nblkptr; j++) {
+ blkptr_t *cbp = &cdnp->dn_blkptr[j];
+ dsl_scan_prefetch(scn, *bufp, cbp,
+ zb->zb_objset, zb->zb_blkid * epb + i, j);
+ }
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ }
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = (*bufp)->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * pausing. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode, *bufp,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode, *bufp,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ }
+
+ return (0);
+}
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(&dnp->dn_spill,
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+ dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ arc_buf_t *buf = NULL;
+ blkptr_t bp_toread = *bp;
+
+ /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+ if (dsl_scan_check_pause(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ if (bp->blk_birth == 0)
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pbuf, bp);
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return;
+
+ if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+ /*
+ * For non-user-accounting blocks, we need to read the
+ * new bp (from a deleted snapshot, found in
+ * check_existing_xlation). If we used the old bp,
+ * pointers inside this block from before we resumed
+ * would be untranslated.
+ *
+ * For user-accounting blocks, we need to read the old
+ * bp, because we will apply the entire space delta to
+ * it (original untranslated -> translations from
+ * deleted snap -> now).
+ */
+ bp_toread = *bp;
+ }
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+ &buf) != 0)
+ return;
+
+ /*
+	 * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ ASSERT(buf == NULL);
+ return;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ }
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_t zb;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ dsl_scan_visitbp(bp, &zb, NULL, NULL,
+ ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ if (dsl_dataset_is_snapshot(ds)) {
+ /* Note, scn_cur_{min,max}_txg stays the same. */
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else {
+ zfs_dbgmsg("destroying ds %llu; ignoring",
+ (u_longlong_t)ds->ds_object);
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ }
+ dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, &mintxg) == 0) {
+ int err;
+
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ err = zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+ VERIFY(err == 0 || err == EEXIST);
+ if (err == EEXIST) {
+ /* Both were there to begin with */
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, mintxg, tx));
+ }
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+}
+
+struct enqueue_clones_arg {
+ dmu_tx_t *tx;
+ uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct enqueue_clones_arg *eca = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (dmu_objset_from_ds(ds, &os))
+ goto out;
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. So we traverse the
+ * ZIL here, rather than in scan_recurse(), because the regular
+ * snapshot block-sharing rules don't apply to it.
+ */
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+ dsl_scan_zil(dp, &os->os_zil_header);
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+ char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "pausing=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_pausing);
+ kmem_free(dsname, ZFS_MAXNAMELEN);
+
+ if (scn->scn_pausing)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ goto out;
+ }
+
+ /*
+ * Add descendent datasets to work queue.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ }
+ if (ds->ds_phys->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (ds->ds_phys->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == ds->ds_phys->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ VERIFY(zap_join_key(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ } else {
+ struct enqueue_clones_arg eca;
+ eca.tx = tx;
+ eca.originobj = ds->ds_object;
+
+ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e., DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_pause(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+ (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_pausing);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_t zb = { 0 };
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
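
A toy user-space model of the visiting order described in the block comment above dsl_scan_ddt(); the types and helpers here are hypothetical, not the kernel API. Phase one walks the deduplicated entries (refcnt > 1) once each; phase two walks everything top-down but skips whatever phase one already covered, which is the role ddt_class_contains() plays in dsl_scan_visitbp(). Every block ends up scrubbed exactly once as long as its refcnt does not change mid-scan.

#include <stdio.h>

/* Toy block table; entries with refcnt > 1 stand in for the DDT classes. */
struct toy_blk {
	int id;
	int refcnt;
};

static struct toy_blk pool[] = {
	{ 1, 3 }, { 2, 1 }, { 3, 2 }, { 4, 1 }, { 5, 1 },
};
#define	NBLKS	(sizeof (pool) / sizeof (pool[0]))

/* Stand-in for ddt_class_contains(): was this block covered in phase one? */
static int
toy_ddt_covers(const struct toy_blk *b)
{
	return (b->refcnt > 1);
}

int
main(void)
{
	unsigned i;

	/* Phase one: walk the "DDT"; each deduplicated block is scrubbed once. */
	for (i = 0; i < NBLKS; i++)
		if (pool[i].refcnt > 1)
			printf("ddt phase: scrub block %d\n", pool[i].id);

	/* Phase two: top-down walk; skip blocks phase one already handled. */
	for (i = 0; i < NBLKS; i++)
		if (!toy_ddt_covers(&pool[i]))
			printf("top-down:  scrub block %d\n", pool[i].id);
	return (0);
}

The refcnt test in toy_ddt_covers() is only a stand-in for the real comparison of the block's dedup class against scn_ddt_class_max.
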
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_pausing)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_pausing);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ /*
+ * If we were paused, continue from here. Note if the
+ * ds we were paused on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ /*
+ * In case we were paused right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+ /* keep pulling things out of the zap-object-as-queue */
+ while (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj),
+ zap_cursor_retrieve(&zc, &za) == 0) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ dsobj = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+ /* Set up min/max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (za.za_first_integer != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ za.za_first_integer);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ ds->ds_phys->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ zap_cursor_fini(&zc);
+ if (scn->scn_pausing)
+ return;
+ }
+ zap_cursor_fini(&zc);
+}
+
+static int
+dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+ uint64_t elapsed_nanosecs;
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa))
+ return (ERESTART);
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ return (used != 0);
+}
+
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
+ */
+ if (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, tx->tx_txg);
+ dsl_scan_setup_sync(scn, &func, tx);
+ }
+
+ if (!dsl_scan_active(scn) ||
+ spa_sync_pass(dp->dp_spa) > 1)
+ return;
+
+ scn->scn_visited_this_txg = 0;
+ scn->scn_pausing = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the free list. If we pause the free, don't do
+ * any scanning. This ensures that there is no free list when
+ * we are scanning, so the scan code doesn't have to worry about
+ * traversing it.
+ */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ dsl_scan_free_cb, scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj txg %llu",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ (longlong_t)tx->tx_txg);
+ scn->scn_visited_this_txg = 0;
+ /*
+ * Re-sync the ddt so that we can further modify
+ * it when doing bprewrite.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err == ERESTART)
+ return;
+ }
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ dsl_scan_visit(scn, tx);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("visited %llu blocks in %llums",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+ if (!scn->scn_pausing) {
+ /* finished with scan. */
+ zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+ dsl_scan_done(scn, B_TRUE, tx);
+ }
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+}
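
A quick stand-alone check of the pair-counting arithmetic in count_block(), using made-up vdev ids: with three DVAs, equal counts how many of the three pairs land on the same vdev, so it can only be 0, 1, or 3; one matching pair means two of the three copies share a vdev, and three means all of them do.

#include <stdio.h>

int
main(void)
{
	int vdev[3] = { 0, 0, 1 };	/* made-up vdev ids for the three DVAs */
	int equal = (vdev[0] == vdev[1]) + (vdev[0] == vdev[2]) +
	    (vdev[1] == vdev[2]);

	if (equal == 1)
		printf("two of three copies share a vdev\n");
	else if (equal == 3)
		printf("all three copies share a vdev\n");
	else
		printf("all copies on distinct vdevs\n");	/* equal == 0 */
	return (0);
}
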
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_t *zb)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ boolean_t needs_io;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+ int zio_priority;
+ int scan_delay = 0;
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg)
+ return (0);
+
+ count_block(dp->dp_blkstats, bp);
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ zio_priority = ZIO_PRIORITY_SCRUB;
+ needs_io = B_TRUE;
+ scan_delay = zfs_scrub_delay;
+ } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ zio_flags |= ZIO_FLAG_RESILVER;
+ zio_priority = ZIO_PRIORITY_RESILVER;
+ needs_io = B_FALSE;
+ scan_delay = zfs_resilver_delay;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vdev_t *vd = vdev_lookup_top(spa,
+ DVA_GET_VDEV(&bp->blk_dva[d]));
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io) {
+ if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ needs_io = B_TRUE;
+ } else {
+ needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+ phys_birth, 1);
+ }
+ }
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If we're seeing recent (zfs_scan_idle) "important" I/Os
+ * then throttle our workload to limit the impact of a scan.
+ */
+ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+ delay(scan_delay);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ dsl_scan_scrub_done, NULL, zio_priority,
+ zio_flags, zb));
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+ dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
deleted file mode 100644
index 50cc069a3a78..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
+++ /dev/null
@@ -1,1060 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
-int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
- NULL,
- dsl_pool_scrub_clean_cb
-};
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- enum scrub_func *funcp = arg2;
- dmu_object_type_t ot = 0;
- boolean_t complete = B_FALSE;
-
- dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
- ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
- ASSERT(*funcp > SCRUB_FUNC_NONE);
- ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
- dp->dp_scrub_min_txg = 0;
- dp->dp_scrub_max_txg = tx->tx_txg;
-
- if (*funcp == SCRUB_FUNC_CLEAN) {
- vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
- /* rewrite all disk labels */
- vdev_config_dirty(rvd);
-
- if (vdev_resilver_needed(rvd,
- &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_RESILVER_START);
- dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
- tx->tx_txg);
- } else {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_SCRUB_START);
- }
-
- /* zero out the scrub stats in all vdev_stat_t's */
- vdev_scrub_stat_update(rvd,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
-
- dp->dp_spa->spa_scrub_started = B_TRUE;
- }
-
- /* back to the generic stuff */
-
- if (dp->dp_blkstats == NULL) {
- dp->dp_blkstats =
- kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
- }
- bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
- ot = DMU_OT_ZAP_OTHER;
-
- dp->dp_scrub_func = *funcp;
- dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
- ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
- dp->dp_scrub_restart = B_FALSE;
- dp->dp_spa->spa_scrub_errors = 0;
-
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &dp->dp_spa->spa_scrub_errors, tx));
-
- spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
- "func=%u mintxg=%llu maxtxg=%llu",
- *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- boolean_t *completep = arg2;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- mutex_enter(&dp->dp_scrub_cancel_lock);
-
- if (dp->dp_scrub_restart) {
- dp->dp_scrub_restart = B_FALSE;
- *completep = B_FALSE;
- }
-
- /* XXX this is scrub-clean specific */
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- while (dp->dp_spa->spa_scrub_inflight > 0) {
- cv_wait(&dp->dp_spa->spa_scrub_io_cv,
- &dp->dp_spa->spa_scrub_lock);
- }
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_started = B_FALSE;
- dp->dp_spa->spa_scrub_active = B_FALSE;
-
- dp->dp_scrub_func = SCRUB_FUNC_NONE;
- VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, tx));
- dp->dp_scrub_queue_obj = 0;
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, tx));
-
- spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
- "complete=%u", *completep);
-
- /* below is scrub-clean specific */
- vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
- *completep);
- /*
- * If the scrub/resilver completed, update all DTLs to reflect this.
- * Whether it succeeded or not, vacate all temporary scrub DTLs.
- */
- vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
- *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
- if (*completep)
- spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
- spa_errlog_rotate(dp->dp_spa);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
- dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
- boolean_t complete = B_FALSE;
-
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-int
-dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- /*
- * This function will be used by bp-rewrite wad to intercept frees.
- */
- return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
- done, private, arc_flags));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
- return (zb->zb_objset == 0 && zb->zb_object == 0 &&
- zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
-
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb1->zb_object != -1ULL);
- ASSERT(zb2->zb_level == 0);
-
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == -1ULL)
- return (B_TRUE);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
- if (zb1->zb_object == 0) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
- }
-
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == 0)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
-{
- int elapsed_ticks;
- int mintime;
-
- if (dp->dp_scrub_pausing)
- return (B_TRUE); /* we're already pausing */
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
- return (B_FALSE); /* we're resuming */
-
- /* We only know how to resume from level-0 blocks. */
- if (zb->zb_level != 0)
- return (B_FALSE);
-
- mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
- zfs_scrub_min_time;
- elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
- if (elapsed_ticks > hz * zfs_txg_timeout ||
- (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
- dprintf("pausing at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
- dp->dp_scrub_pausing = B_TRUE;
- dp->dp_scrub_bookmark = *zb;
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
- dsl_pool_t *zta_dp;
- zil_header_t *zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static void
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- /*
- * One block ("stubby") can be allocated a long time ago; we
- * want to visit that one because it has been allocated
- * (on-disk) even if it hasn't been claimed (even though for
- * plain scrub there's nothing to do to it).
- */
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
- return;
-
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- /*
- * birth can be < claim_txg if this record's txg is
- * already txg sync'ed (but this log block contains
- * other records that are not synced)
- */
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
-
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
- }
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
- uint64_t claim_txg = zh->zh_claim_txg;
- zil_traverse_arg_t zta = { dp, zh };
- zilog_t *zilog;
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
- */
- if (claim_txg == 0 && spa_writeable(dp->dp_spa))
- return;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
- claim_txg);
-
- zil_free(zilog);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
- arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
- int err;
- arc_buf_t *buf = NULL;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- if (scrub_pause(dp, zb))
- return;
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
- /*
- * If we already visited this bp & everything below (in
- * a prior txg), don't bother doing it again.
- */
- if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
- return;
-
- /*
- * If we found the block we're trying to resume from, or
- * we went past it to a different object, zero it out to
- * indicate that it's OK to start checking for pausing
- * again.
- */
- if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
- zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
- dprintf("resuming at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
- }
- }
-
- if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- cbp = buf->b_data;
-
- for (i = 0; i < epb; i++, cbp++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- scrub_visitbp(dp, dnp, buf, cbp, &czb);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
- dnode_phys_t *child_dnp;
- int i;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- child_dnp = buf->b_data;
-
- for (i = 0; i < epb; i++, child_dnp++) {
- scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
- zb->zb_blkid * epb + i);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
- objset_phys_t *osp;
-
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
-
- osp = buf->b_data;
-
- traverse_zil(dp, &osp->os_zil_header);
-
- scrub_visitdnode(dp, &osp->os_meta_dnode,
- buf, zb->zb_objset, 0);
- if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- scrub_visitdnode(dp, &osp->os_userused_dnode,
- buf, zb->zb_objset, 0);
- scrub_visitdnode(dp, &osp->os_groupused_dnode,
- buf, zb->zb_objset, 0);
- }
- }
-
- (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
- if (buf)
- (void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object)
-{
- int j;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
- }
-
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
- zbookmark_t zb;
-
- SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
- scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) != 0) {
- return;
- }
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- dp->dp_scrub_bookmark.zb_objset =
- ds->ds_phys->ds_prev_snap_obj;
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_prev_snap_obj, tx) == 0);
- }
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
- } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
- }
-
- if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds1->ds_object, tx) == 0) {
- int err = zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds2->ds_object, tx);
- VERIFY(err == 0 || err == EEXIST);
- if (err == EEXIST) {
- /* Both were there to begin with */
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds2->ds_object, tx) == 0) {
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
-}
-
-struct enqueue_clones_arg {
- dmu_tx_t *tx;
- uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- struct enqueue_clones_arg *eca = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
- dp = ds->ds_dir->dd_pool;
-
- if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
- dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
- ds = prev;
- }
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, eca->tx) == 0);
- }
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
- uint64_t min_txg_save;
-
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
- /*
- * Iterate over the bps in this ds.
- */
- min_txg_save = dp->dp_scrub_min_txg;
- dp->dp_scrub_min_txg =
- MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
- scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
- dp->dp_scrub_min_txg = min_txg_save;
-
- if (dp->dp_scrub_pausing)
- goto out;
-
- /*
- * Add descendent datasets to work queue.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- if (ds->ds_phys->ds_num_children > 1) {
- boolean_t usenext = B_FALSE;
- if (ds->ds_phys->ds_next_clones_obj != 0) {
- uint64_t count;
- /*
- * A bug in a previous version of the code could
- * cause upgrade_clones_cb() to not set
- * ds_next_snap_obj when it should, leading to a
- * missing entry. Therefore we can only use the
- * next_clones_obj when its count is correct.
- */
- int err = zap_count(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj, &count);
- if (err == 0 &&
- count == ds->ds_phys->ds_num_children - 1)
- usenext = B_TRUE;
- }
-
- if (usenext) {
- VERIFY(zap_join(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj,
- dp->dp_scrub_queue_obj, tx) == 0);
- } else {
- struct enqueue_clones_arg eca;
- eca.tx = tx;
- eca.originobj = ds->ds_object;
-
- (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
- NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
- }
- }
-
-out:
- dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- dmu_tx_t *tx = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
-
- dp = ds->ds_dir->dd_pool;
-
- while (ds->ds_phys->ds_prev_snap_obj != 0) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- FTAG, &prev);
- if (err) {
- dsl_dataset_rele(ds, FTAG);
- return (err);
- }
-
- /*
- * If this is a clone, we don't need to worry about it for now.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_rele(ds, FTAG);
- dsl_dataset_rele(prev, FTAG);
- return (0);
- }
- dsl_dataset_rele(ds, FTAG);
- ds = prev;
- }
-
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0);
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- spa_t *spa = dp->dp_spa;
- zap_cursor_t zc;
- zap_attribute_t za;
- boolean_t complete = B_TRUE;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- /*
- * If the pool is not loaded, or is trying to unload, leave it alone.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
- return;
-
- if (dp->dp_scrub_restart) {
- enum scrub_func func = dp->dp_scrub_func;
- dp->dp_scrub_restart = B_FALSE;
- dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
- }
-
- if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
- /*
- * We must have resumed after rebooting; reset the vdev
- * stats to know that we're doing a scrub (although it
- * will think we're just starting now).
- */
- vdev_scrub_stat_update(spa->spa_root_vdev,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
- }
-
- dp->dp_scrub_pausing = B_FALSE;
- dp->dp_scrub_start_time = lbolt64;
- dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
- spa->spa_scrub_active = B_TRUE;
-
- if (dp->dp_scrub_bookmark.zb_objset == 0) {
- /* First do the MOS & ORIGIN */
- scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
- if (dp->dp_scrub_pausing)
- goto out;
-
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- VERIFY(0 == dmu_objset_find_spa(spa,
- NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
- } else {
- scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
- }
- ASSERT(!dp->dp_scrub_pausing);
- } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
- /*
- * If we were paused, continue from here. Note if the
- * ds we were paused on was deleted, the zb_objset will
- * be -1, so we will skip this and find a new objset
- * below.
- */
- scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
- if (dp->dp_scrub_pausing)
- goto out;
- }
-
- /*
- * In case we were paused right at the end of the ds, zero the
- * bookmark so we don't think that we're still trying to resume.
- */
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
- /* keep pulling things out of the zap-object-as-queue */
- while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
- zap_cursor_retrieve(&zc, &za) == 0) {
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, za.za_name, tx));
- scrub_visitds(dp, za.za_first_integer, tx);
- if (dp->dp_scrub_pausing)
- break;
- zap_cursor_fini(&zc);
- }
- zap_cursor_fini(&zc);
- if (dp->dp_scrub_pausing)
- goto out;
-
- /* done. */
-
- dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
- return;
-out:
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors, tx));
-
- /* XXX this is scrub-clean specific */
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > 0)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
- mutex_enter(&dp->dp_scrub_cancel_lock);
- dp->dp_scrub_restart = B_TRUE;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
- int i;
-
- /*
- * If we resume after a reboot, zab will be NULL; don't record
- * incomplete stats in that case.
- */
- if (zab == NULL)
- return;
-
- for (i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
- int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
- zfs_blkstat_t *zb = &zab->zab_type[l][t];
- int equal;
-
- zb->zb_count++;
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_gangs += BP_COUNT_GANG(bp);
-
- switch (BP_GET_NDVAS(bp)) {
- case 2:
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]))
- zb->zb_ditto_2_of_2_samevdev++;
- break;
- case 3:
- equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) +
- (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2])) +
- (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal == 1)
- zb->zb_ditto_2_of_3_samevdev++;
- else if (equal == 3)
- zb->zb_ditto_3_of_3_samevdev++;
- break;
- }
- }
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- zio_data_buf_free(zio->io_data, zio->io_size);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- if (zio->io_error && (zio->io_error != ECKSUM ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
- spa->spa_scrub_errors++;
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
- const blkptr_t *bp, const zbookmark_t *zb)
-{
- size_t size = BP_GET_PSIZE(bp);
- spa_t *spa = dp->dp_spa;
- boolean_t needs_io;
- int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- int zio_priority;
-
- ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
-
- if (bp->blk_birth >= dp->dp_scrub_max_txg)
- return (0);
-
- count_block(dp->dp_blkstats, bp);
-
- if (dp->dp_scrub_isresilver == 0) {
- /* It's a scrub */
- zio_flags |= ZIO_FLAG_SCRUB;
- zio_priority = ZIO_PRIORITY_SCRUB;
- needs_io = B_TRUE;
- } else {
- /* It's a resilver */
- zio_flags |= ZIO_FLAG_RESILVER;
- zio_priority = ZIO_PRIORITY_RESILVER;
- needs_io = B_FALSE;
- }
-
- /* If it's an intent log block, failure is expected. */
- if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
- zio_flags |= ZIO_FLAG_SPECULATIVE;
-
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
- vdev_t *vd = vdev_lookup_top(spa,
- DVA_GET_VDEV(&bp->blk_dva[d]));
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined +=
- DVA_GET_ASIZE(&bp->blk_dva[d]);
- mutex_exit(&vd->vdev_stat_lock);
-
- /* if it's a resilver, this may not be in the target range */
- if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best estimate we have is the
- * scrub range, which has already been checked.
- * XXX -- it would be better to change our
- * allocation policy to ensure that all
- * gang members reside on the same vdev.
- */
- needs_io = B_TRUE;
- } else {
- needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
- bp->blk_birth, 1);
- }
- }
- }
-
- if (needs_io && !zfs_no_scrub_io) {
- void *data = zio_data_buf_alloc(size);
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
- mutex_exit(&spa->spa_scrub_lock);
-
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_pool_scrub_clean_done, NULL, zio_priority,
- zio_flags, zb));
- }
-
- /* do not relocate this block */
- return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
- spa_t *spa = dp->dp_spa;
-
- /*
- * Purge all vdev caches. We do this here rather than in sync
- * context because this requires a writer lock on the spa_config
- * lock, which we can't do from sync context. The
- * spa_scrub_reopen flag indicates that vdev_open() should not
- * attempt to start another scrub.
- */
- spa_vdev_state_enter(spa);
- spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
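
The removed dsl_scrub.c paced its traversal with scrub_pause(): each txg the scan runs for at least zfs_scrub_min_time (or zfs_resilver_min_time) ticks, and yields once the txg sync is waiting or zfs_txg_timeout is exceeded, recording a resume bookmark. A minimal userland sketch of that pacing decision follows (tick rate and the sync-waiting flag are stubbed-in assumptions; the replacement logic now lives in dsl_scan.c):

/* Sketch only — not part of the diff; constants are assumed values. */
#include <stdint.h>

#define HZ			100	/* assumed ticks per second */
#define TXG_TIMEOUT_SECS	5	/* stand-in for zfs_txg_timeout */

static int
should_pause(int64_t now_ticks, int64_t start_ticks, int min_secs,
    int sync_waiting)
{
	int64_t elapsed = now_ticks - start_ticks;

	/* Always yield once the whole txg window has been consumed. */
	if (elapsed > HZ * TXG_TIMEOUT_SECS)
		return (1);
	/* Otherwise yield only after the minimum scan time, if sync waits. */
	if (elapsed > HZ * min_secs && sync_waiting)
		return (1);
	return (0);
}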
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
index 21100225abf7..b0818ce274d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -19,18 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
-#include <sys/cred.h>
+#include <sys/metaslab.h>
#define DST_AVG_BLKSHIFT 14
@@ -50,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
offsetof(dsl_sync_task_t, dst_node));
dstg->dstg_pool = dp;
- dstg->dstg_cr = CRED();
return (dstg);
}
@@ -112,14 +108,21 @@ top:
return (dstg->dstg_err);
}
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+ /*
+ * We don't generally have many sync tasks, so pay the price of
+ * add_tail to get the tasks executed in the right order.
+ */
+ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+ dstg, txg));
dmu_tx_commit(tx);
txg_wait_synced(dstg->dstg_pool, txg);
- if (dstg->dstg_err == EAGAIN)
+ if (dstg->dstg_err == EAGAIN) {
+ txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
goto top;
+ }
return (dstg->dstg_err);
}
@@ -131,7 +134,12 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
dstg->dstg_nowaiter = B_TRUE;
txg = dmu_tx_get_txg(tx);
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+ /*
+ * We don't generally have many sync tasks, so pay the price of
+ * add_tail to get the tasks executed in the right order.
+ */
+ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+ dstg, txg));
}
void
@@ -150,25 +158,30 @@ void
dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
{
dsl_sync_task_t *dst;
- void *tr_cookie;
+ dsl_pool_t *dp = dstg->dstg_pool;
+ uint64_t quota, used;
ASSERT3U(dstg->dstg_err, ==, 0);
/*
- * Check for sufficient space.
+ * Check for sufficient space. We just check against what's
+ * on-disk; we don't want any in-flight accounting to get in our
+ * way, because open context may have already used up various
+ * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
*/
- dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
- dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
- /* don't bother trying again */
- if (dstg->dstg_err == ERESTART)
- dstg->dstg_err = EAGAIN;
- if (dstg->dstg_err)
+ quota = dsl_pool_adjustedsize(dp, B_FALSE) -
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ used = dp->dp_root_dir->dd_phys->dd_used_bytes;
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) {
+ dstg->dstg_err = ENOSPC;
return;
+ }
/*
* Check for errors by calling checkfuncs.
*/
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
dst->dst_err =
@@ -183,13 +196,10 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
*/
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
- dstg->dstg_cr, tx);
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
}
}
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- dsl_dir_tempreserve_clear(tr_cookie, tx);
+ rw_exit(&dp->dp_config_rwlock);
if (dstg->dstg_nowaiter)
dsl_sync_task_group_destroy(dstg);
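
The reworked dsl_sync_task_group_sync() above no longer temp-reserves space; it compares the task's worst-case footprint against on-disk usage, tripling the request because MOS metadata is triple-dittoed. A small arithmetic sketch of that check, with pool figures invented purely for illustration:

/* Sketch only — not part of the diff; sizes in the comment are made up. */
#include <stdint.h>
#include <errno.h>

static int
synctask_space_check(uint64_t quota, uint64_t used, uint64_t task_space)
{
	/* MOS blocks carry three copies, so charge the request three times. */
	if (task_space > 0 && used + task_space * 3 > quota)
		return (ENOSPC);
	return (0);
}

/*
 * Example: quota = 100 GiB, used = 99.9 GiB, task_space = 64 MiB
 * -> 99.9 GiB + 192 MiB > 100 GiB, so the task fails with ENOSPC
 *    before it ever dirties anything.
 */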
@@ -203,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp,
dsl_sync_task_group_t *dstg;
int err;
+ ASSERT(spa_writeable(dp->dp_spa));
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
@@ -218,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp,
{
dsl_sync_task_group_t *dstg;
+ if (!spa_writeable(dp->dp_spa))
+ return;
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
deleted file mode 100644
index 54247d724d49..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Fletcher Checksums
- * ------------------
- *
- * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
- * recurrence relations:
- *
- * a = a + f
- * i i-1 i-1
- *
- * b = b + a
- * i i-1 i
- *
- * c = c + b (fletcher-4 only)
- * i i-1 i
- *
- * d = d + c (fletcher-4 only)
- * i i-1 i
- *
- * Where
- * a_0 = b_0 = c_0 = d_0 = 0
- * and
- * f_0 .. f_(n-1) are the input data.
- *
- * Using standard techniques, these translate into the following series:
- *
- * __n_ __n_
- * \ | \ |
- * a = > f b = > i * f
- * n /___| n - i n /___| n - i
- * i = 1 i = 1
- *
- *
- * __n_ __n_
- * \ | i*(i+1) \ | i*(i+1)*(i+2)
- * c = > ------- f d = > ------------- f
- * n /___| 2 n - i n /___| 6 n - i
- * i = 1 i = 1
- *
- * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
- * Since the additions are done mod (2^64), errors in the high bits may not
- * be noticed. For this reason, fletcher-2 is deprecated.
- *
- * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
- * A conservative estimate of how big the buffer can get before we overflow
- * can be estimated using f_i = 0xffffffff for all i:
- *
- * % bc
- * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
- * 2264
- * quit
- * %
- *
- * So blocks of up to 2k will not overflow. Our largest block size is
- * 128k, which has 32k 4-byte words, so we can compute the largest possible
- * accumulators, then divide by 2^64 to figure the max amount of overflow:
- *
- * % bc
- * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
- * a/2^64;b/2^64;c/2^64;d/2^64
- * 0
- * 0
- * 1365
- * 11186858
- * quit
- * %
- *
- * So a and b cannot overflow. To make sure each bit of input has some
- * effect on the contents of c and d, we can look at what the factors of
- * the coefficients in the equations for c_n and d_n are. The number of 2s
- * in the factors determines the lowest set bit in the multiplier. Running
- * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
- * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
- * the 64-bit accumulators, every bit of every f_i effects every accumulator,
- * even for 128k blocks.
- *
- * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
- * we could do our calculations mod (2^32 - 1) by adding in the carries
- * periodically, and store the number of carries in the top 32-bits.
- *
- * --------------------
- * Checksum Performance
- * --------------------
- *
- * There are two interesting components to checksum performance: cached and
- * uncached performance. With cached data, fletcher-2 is about four times
- * faster than fletcher-4. With uncached data, the performance difference is
- * negligible, since the cost of a cache fill dominates the processing time.
- * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
- * efficient pass over the data.
- *
- * In normal operation, the data which is being checksummed is in a buffer
- * which has been filled either by:
- *
- * 1. a compression step, which will be mostly cached, or
- * 2. a bcopy() or copyin(), which will be uncached (because the
- * copy is cache-bypassing).
- *
- * For both cached and uncached data, both fletcher checksums are much faster
- * than sha-256, and slower than 'off', which doesn't touch the data at all.
- */
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/byteorder.h>
-#include <sys/spa.h>
-
-void
-fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
- a0 += ip[0];
- a1 += ip[1];
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-void
-fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
- a0 += BSWAP_64(ip[0]);
- a1 += BSWAP_64(ip[1]);
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-void
-fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
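
The fletcher.c removed above implements ZFS's 2nd- and 4th-order Fletcher checksums; its block comment describes the fletcher-4 recurrence a += f, b += a, c += b, d += c over 32-bit input words with 64-bit accumulators. A self-contained sketch of that core loop, free of the ZFS headers, looks like this:

/* Sketch only — not part of the diff. Assumes native byte order and a
 * buffer size that is a multiple of 4 bytes. */
#include <stdint.h>
#include <stddef.h>

static void
fletcher4(const void *buf, size_t size, uint64_t cksum[4])
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = 0, b = 0, c = 0, d = 0;

	for (; ip < ipend; ip++) {
		a += ip[0];	/* first-order sum */
		b += a;		/* second-order sum */
		c += b;		/* third-order sum (fletcher-4 only) */
		d += c;		/* fourth-order sum (fletcher-4 only) */
	}
	cksum[0] = a;
	cksum[1] = b;
	cksum[2] = c;
	cksum[3] = d;
}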
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
index a88b85c7ec25..a2d9dab88440 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -20,21 +20,20 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
- * We keep our own copy of this algorithm for 2 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
* directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
+ * 2. Our version of lzjb does not have a number of checks that the
* common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
* In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
+ * take a destination buffer size and returns the compressed length, or the
+ * source length if compression would overflow the destination buffer.
*/
#include <sys/zfs_context.h>
@@ -44,7 +43,7 @@
#define MATCH_MIN 3
#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 256
+#define LEMPEL_SIZE 1024
/*ARGSUSED*/
size_t
@@ -54,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
uchar_t *dst = d_start;
uchar_t *cpy, *copymap;
int copymask = 1 << (NBBY - 1);
- int mlen, offset;
+ int mlen, offset, hash;
uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+ uint16_t lempel[LEMPEL_SIZE] = { 0 };
while (src < (uchar_t *)s_start + s_len) {
if ((copymask <<= 1) == (1 << NBBY)) {
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
- if (d_len != s_len)
- return (s_len);
- mlen = s_len;
- for (src = s_start, dst = d_start; mlen; mlen--)
- *dst++ = *src++;
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
return (s_len);
- }
copymask = 1;
copymap = dst;
*dst++ = 0;
@@ -76,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
*dst++ = *src++;
continue;
}
- hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
- (LEMPEL_SIZE - 1)];
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
offset = (intptr_t)(src - *hp) & OFFSET_MASK;
*hp = (uint16_t)(uintptr_t)src;
cpy = src - offset;
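
The lzjb change grows the Lempel table to 1024 entries, zero-initializes it so identical blocks compress identically (a prerequisite for dedup), and replaces the old XOR hash with a shift-and-add hash of the next three source bytes. The hash step, lifted into a tiny standalone helper:

/* Sketch only — not part of the diff; mirrors the hash in the hunk above. */
#include <stdint.h>

#define LEMPEL_SIZE	1024

static unsigned
lzjb_hash3(const uint8_t *src)
{
	uint32_t hash;

	hash = ((uint32_t)src[0] << 16) + ((uint32_t)src[1] << 8) + src[2];
	hash += hash >> 9;
	hash += hash >> 5;
	return (hash & (LEMPEL_SIZE - 1));
}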
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index c5ce27cb677c..17b4b12c4ee4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -23,7 +23,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
@@ -35,6 +34,11 @@ uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ */
+static int metaslab_debug = 0;
+
+/*
* Minimum size which forces the dynamic allocator to change
* it's allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
@@ -72,12 +76,13 @@ int metaslab_smo_bonus_pct = 150;
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
@@ -87,58 +92,73 @@ metaslab_class_create(space_map_ops_t *ops)
void
metaslab_class_destroy(metaslab_class_t *mc)
{
- metaslab_group_t *mg;
-
- while ((mg = mc->mc_rotor) != NULL) {
- metaslab_class_remove(mc, mg);
- metaslab_group_destroy(mg);
- }
+ ASSERT(mc->mc_rotor == NULL);
+ ASSERT(mc->mc_alloc == 0);
+ ASSERT(mc->mc_deferred == 0);
+ ASSERT(mc->mc_space == 0);
+ ASSERT(mc->mc_dspace == 0);
kmem_free(mc, sizeof (metaslab_class_t));
}
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+int
+metaslab_class_validate(metaslab_class_t *mc)
{
- metaslab_group_t *mgprev, *mgnext;
+ metaslab_group_t *mg;
+ vdev_t *vd;
- ASSERT(mg->mg_class == NULL);
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
- if ((mgprev = mc->mc_rotor) == NULL) {
- mg->mg_prev = mg;
- mg->mg_next = mg;
- } else {
- mgnext = mgprev->mg_next;
- mg->mg_prev = mgprev;
- mg->mg_next = mgnext;
- mgprev->mg_next = mg;
- mgnext->mg_prev = mg;
- }
- mc->mc_rotor = mg;
- mg->mg_class = mc;
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
}
void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
- metaslab_group_t *mgprev, *mgnext;
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
- ASSERT(mg->mg_class == mc);
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
- mgprev = mg->mg_prev;
- mgnext = mg->mg_next;
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
- if (mg == mgnext) {
- mc->mc_rotor = NULL;
- } else {
- mc->mc_rotor = mgnext;
- mgprev->mg_next = mgnext;
- mgnext->mg_prev = mgprev;
- }
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
- mg->mg_prev = NULL;
- mg->mg_next = NULL;
- mg->mg_class = NULL;
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
/*
@@ -179,9 +199,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
mg->mg_vd = vd;
- metaslab_class_add(mc, mg);
+ mg->mg_class = mc;
+ mg->mg_activation_count = 0;
return (mg);
}
@@ -189,11 +209,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
void
metaslab_group_destroy(metaslab_group_t *mg)
{
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ /*
+ * We may have gone below zero with the activation count
+ * either because we never activated in the first place or
+ * because we're done, and possibly removing the vdev.
+ */
+ ASSERT(mg->mg_activation_count <= 0);
+
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
kmem_free(mg, sizeof (metaslab_group_t));
}
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count <= 0);
+
+ if (++mg->mg_activation_count <= 0)
+ return;
+
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+}
+
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+ if (--mg->mg_activation_count != 0) {
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count < 0);
+ return;
+ }
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+}
+
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
@@ -611,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
metaslab_group_add(mg, msp);
+ if (metaslab_debug && smo->smo_object != 0) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+ SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
+ mutex_exit(&msp->ms_lock);
+ }
+
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
@@ -621,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
metaslab_sync_done(msp, 0);
if (txg != 0) {
- /*
- * The vdev is dirty, but the metaslab isn't -- it just needs
- * to have metaslab_sync_done() invoked from vdev_sync_done().
- * [We could just dirty the metaslab, but that would cause us
- * to allocate a space map object for it, which is wasteful
- * and would mess up the locality logic in metaslab_weight().]
- */
- ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
}
return (msp);
@@ -640,10 +730,9 @@ void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
- int t;
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc, B_TRUE);
+ vdev_space_update(mg->mg_vd,
+ -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
metaslab_group_remove(mg, msp);
@@ -652,11 +741,16 @@ metaslab_fini(metaslab_t *msp)
space_map_unload(&msp->ms_map);
space_map_destroy(&msp->ms_map);
- for (t = 0; t < TXG_SIZE; t++) {
+ for (int t = 0; t < TXG_SIZE; t++) {
space_map_destroy(&msp->ms_allocmap[t]);
space_map_destroy(&msp->ms_freemap[t]);
}
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_destroy(&msp->ms_defermap[t]);
+
+ ASSERT3S(msp->ms_deferspace, ==, 0);
+
mutex_exit(&msp->ms_lock);
mutex_destroy(&msp->ms_lock);
@@ -741,7 +835,7 @@ metaslab_prefetch(metaslab_group_t *mg)
if (!sm->sm_loaded && smo->smo_object != 0) {
mutex_exit(&mg->mg_lock);
- dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+ dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
0ULL, smo->smo_objsize);
mutex_enter(&mg->mg_lock);
}
@@ -759,11 +853,19 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- if (error) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
+ space_map_load_wait(sm);
+ if (!sm->sm_loaded) {
+ int error = space_map_load(sm, sm_ops, SM_FREE,
+ &msp->ms_smo,
+ spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+ if (error) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_claim, sm);
+
}
/*
@@ -812,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
{
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
+ objset_t *mos = spa_meta_objset(spa);
space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
@@ -820,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_obj_t *smo = &msp->ms_smo_syncing;
dmu_buf_t *db;
dmu_tx_t *tx;
- int t;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ ASSERT(!vd->vdev_ishole);
+
+ if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+ return;
/*
* The only state that can actually be changing concurrently with
@@ -832,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* We drop it whenever we call into the DMU, because the DMU
* can call down to us (e.g. via zio_free()) at any time.
*/
- mutex_enter(&msp->ms_lock);
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (smo->smo_object == 0) {
ASSERT(smo->smo_objsize == 0);
ASSERT(smo->smo_alloc == 0);
- mutex_exit(&msp->ms_lock);
smo->smo_object = dmu_object_alloc(mos,
DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
@@ -845,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
(sm->sm_start >> vd->vdev_ms_shift),
sizeof (uint64_t), &smo->smo_object, tx);
- mutex_enter(&msp->ms_lock);
}
+ mutex_enter(&msp->ms_lock);
+
space_map_walk(freemap, space_map_add, freed_map);
if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
@@ -860,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* This metaslab is 100% allocated,
* minus the content of the in-core map (sm),
* minus what's been freed this txg (freed_map),
+ * minus deferred frees (ms_defermap[]),
* minus allocations from txgs in the future
* (because they haven't been committed yet).
*/
@@ -871,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_walk(sm, space_map_remove, allocmap);
space_map_walk(freed_map, space_map_remove, allocmap);
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_remove, allocmap);
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
space_map_remove, allocmap);
@@ -905,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
space_map_obj_t *smosync = &msp->ms_smo_syncing;
space_map_t *sm = &msp->ms_map;
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
- int t;
+ int64_t alloc_delta, defer_delta;
+
+ ASSERT(!vd->vdev_ishole);
mutex_enter(&msp->ms_lock);
@@ -916,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* allocmaps and freemaps and add its capacity to the vdev.
*/
if (freed_map->sm_size == 0) {
- for (t = 0; t < TXG_SIZE; t++) {
+ for (int t = 0; t < TXG_SIZE; t++) {
space_map_create(&msp->ms_allocmap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_create(&msp->ms_defermap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+
+ vdev_space_update(vd, 0, 0, sm->sm_size);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+ alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+ defer_delta = freed_map->sm_space - defer_map->sm_space;
+
+ vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -933,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* If there's a space_map_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
- * Then, add everything we freed in this txg to the map.
+ * Then, add defer_map (oldest deferred frees) to this map and
+ * transfer freed_map (this txg's frees) to defer_map.
*/
space_map_load_wait(sm);
- space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+ space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+ space_map_vacate(freed_map, space_map_add, defer_map);
*smo = *smosync;
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+
/*
* If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then
@@ -948,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int evictable = 1;
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
evictable = 0;
- if (evictable)
+ if (evictable && !metaslab_debug)
space_map_unload(sm);
}
@@ -1119,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
- if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0)
+ if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_allocated because
+ * Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
@@ -1146,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (flags & METASLAB_HINTBP_AVOID)
- mg = vd->vdev_mg->mg_next;
- else
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists (i.e. removed). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL) {
mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
@@ -1158,15 +1303,18 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
}
/*
- * If the hint put us into the wrong class, just follow the rotor.
+ * If the hint put us into the wrong metaslab class, or into a
+ * metaslab group that has been passivated, just follow the rotor.
*/
- if (mg->mg_class != mc)
+ if (mg->mg_class != mc || mg->mg_activation_count <= 0)
mg = mc->mc_rotor;
rotor = mg;
top:
all_zero = B_TRUE;
do {
+ ASSERT(mg->mg_activation_count == 1);
+
vd = mg->mg_vd;
/*
@@ -1211,32 +1359,28 @@ top:
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
- if (mc->mc_allocated == 0) {
+ if (mc->mc_aliquot == 0) {
vdev_stat_t *vs = &vd->vdev_stat;
- uint64_t alloc, space;
- int64_t vu, su;
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
+ int64_t vu, cu;
/*
* Determine percent used in units of 0..1024.
* (This is just to avoid floating point.)
*/
vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- su = (alloc << 10) / (space + 1);
+ cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
/*
* Bias by at most +/- 25% of the aliquot.
*/
- mg->mg_bias = ((su - vu) *
+ mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / (1024 * 4);
}
- if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -1248,7 +1392,7 @@ top:
}
next:
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
if (!all_zero) {
@@ -1328,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
uint64_t size = DVA_GET_ASIZE(dva);
vdev_t *vd;
metaslab_t *msp;
- int error;
+ int error = 0;
ASSERT(DVA_IS_VALID(dva));
@@ -1343,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+
+ if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+ error = ENOENT;
+
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
@@ -1371,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int error = 0;
ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -1400,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
spa_config_exit(spa, SCL_ALLOC, FTAG);
- bp->blk_birth = txg;
+ BP_SET_BIRTH(bp, txg, txg);
return (0);
}
@@ -1412,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
index 5fe4e638055a..6d8e2f221425 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -19,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/refcount.h>
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -192,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder)
return (refcount_remove_many(rc, 1, holder));
}
-#endif
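+/*
+ * Transfer all references and counts from src to dst, leaving src
+ * empty. Each refcount is locked only while its own fields are
+ * updated, so the holders are moved through temporary local lists.
+ */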
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+#endif /* ZFS_DEBUG */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
new file mode 100644
index 000000000000..4db13fd917c3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -0,0 +1,1970 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * A mapping between attribute "string" names and their actual attribute
+ * numeric values, length, and byteswap function is stored persistently
+ * on a per-dataset basis. The names are only used during registration.
+ * All attributes are known by their unique attribute id value. If an
+ * attribute can have a variable size then the value 0 will be used to
+ * indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes
+ * without taking on the overhead of managing each attribute
+ * individually. Since the same set of attributes is typically stored in
+ * the same order, a single table is used to represent that layout. The
+ * ZPL, for example, will usually have only about 10 different layouts
+ * (regular files, device files, symlinks, regular files + scanstamp,
+ * files/dirs with extended attributes, and then each of those minus the
+ * ACL, because it would be kicked out into the spill block).
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what
+ * is stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes. The new attribute will be appended to the end of the
+ * already existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. There should be a small number of
+ * known layouts, and the set of attributes is assumed to typically be
+ * quite small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ * Since the SA attributes are not entirely self-describing we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of each attribute, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping, whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that reads the SA data.
+ */
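+
+/*
+ * A minimal usage sketch, assuming the caller already holds a dmu_tx_t
+ * and has obtained its attribute ids from the user table returned by
+ * sa_setup(). The names os, object, attr_size and tx below are
+ * placeholders for illustration only, not names defined by this file:
+ *
+ *	sa_handle_t *hdl;
+ *	uint64_t size;
+ *
+ *	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl) == 0) {
+ *		(void) sa_lookup(hdl, attr_size, &size, sizeof (size));
+ *		size += 512;
+ *		(void) sa_update(hdl, attr_size, &size, sizeof (size), tx);
+ *		sa_handle_destroy(hdl);
+ *	}
+ */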
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ void *data);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
+#define SA_COPY_DATA(f, s, t, l) \
+ { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else \
+ sa_copy_data(f, s, t, l); \
+ }
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * ZPL legacy layout
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ hdl->sa_bonus_tab = NULL;
+ hdl->sa_spill_tab = NULL;
+ hdl->sa_os = NULL;
+ hdl->sa_userp = NULL;
+ hdl->sa_bonus = NULL;
+ hdl->sa_spill = NULL;
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_num > node2->lot_num)
+ return (1);
+ else if (node1->lot_num < node2->lot_num)
+ return (-1);
+ return (0);
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_hash > node2->lot_hash)
+ return (1);
+ if (node1->lot_hash < node2->lot_hash)
+ return (-1);
+ if (node1->lot_instance > node2->lot_instance)
+ return (1);
+ if (node1->lot_instance < node2->lot_instance)
+ return (-1);
+ return (0);
+}
+
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+/*
+ * Main attribute lookup/update function
+ * Returns 0 for success or nonzero for failure.
+ *
+ * Operates on a bulk array; the first failure aborts further processing
+ * (see the bulk usage sketch after this function).
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (ENOENT);
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ }
+ }
+ return (error);
+}
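+
+/*
+ * Minimal sketch of driving sa_attr_op() through the bulk wrappers,
+ * assuming attribute ids attr_uid and attr_gid obtained from the
+ * sa_setup() user table; hdl and the attribute names are placeholders
+ * used only for illustration:
+ *
+ *	sa_bulk_attr_t bulk[2];
+ *	uint64_t uid, gid;
+ *	int count = 0, error;
+ *
+ *	SA_ADD_BULK_ATTR(bulk, count, attr_uid, NULL, &uid, sizeof (uid));
+ *	SA_ADD_BULK_ATTR(bulk, count, attr_gid, NULL, &gid, sizeof (gid));
+ *	error = sa_bulk_lookup(hdl, bulk, count);
+ */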
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ sa->sa_layout_attr_obj = zap_create(os,
+ DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+ &sa->sa_layout_attr_obj, tx) == 0);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (EFBIG);
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different sizes:
+ * the size of the sa header,
+ * the total number of bytes to be stored, and
+ * the index in the attribute array at which a spill would occur.
+ *
+ * The boolean will_spill is set when spilling is necessary. It
+ * is only set when the buftype is SA_BONUS.
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
+ boolean_t *will_spill)
+{
+ int var_size = 0;
+ int i;
+ int full_space;
+ int hdrsize;
+ boolean_t done = B_FALSE;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+
+ if (buftype == SA_BONUS)
+ *will_spill = B_FALSE;
+
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz;
+
+ *total += attr_desc[i].sa_length;
+ if (done)
+ goto next;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz) {
+ var_size++;
+ }
+
+ if (is_var_sz && var_size > 1) {
+ if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+ *total < full_space) {
+ hdrsize += sizeof (uint16_t);
+ } else {
+ done = B_TRUE;
+ *index = i;
+ if (buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * find the index of where a spill *could* occur, then
+ * keep counting the remaining attribute space. The sum
+ * is used later for sizing the bonus and spill buffers.
+ */
+ if (buftype == SA_BONUS && *index == -1 &&
+ P2ROUNDUP(*total + hdrsize, 8) >
+ (full_space - sizeof (blkptr_t))) {
+ *index = i;
+ done = B_TRUE;
+ }
+
+next:
+ if (P2ROUNDUP(*total + hdrsize, 8) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find the layout that corresponds to the ordering of the attributes.
+ * If none is found, a new layout number is created and added to the
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ int buf_space;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int hdrsize, spillhdrsize;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, &i, &used, &spilling);
+
+ if (used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+ &hdl->sa_spill) == 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+ attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+ &spill_used, &dummy);
+
+ if (spill_used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ if (spilling)
+ buf_space = (sa->sa_force_spill) ?
+ 0 : SA_BLKPTR_SPACE - hdrsize;
+ else
+ buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+
+ if (buf_space < length) { /* switch to spill buffer */
+ VERIFY(bonustype == DMU_OT_SA);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ buf_space -= P2ROUNDUP(length, 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+ * Must be DMU_OT_SA for arbitrary layouts
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error || (error == 0 && sa_attr_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = ENOENT;
+ switch (error) {
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ default:
+ goto bail;
+ }
+ }
+
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+ * Attribute table is constructed from requested attribute list,
+ * previously foreign registered attributes, and also the legacy
+ * ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) +1);
+ }
+ zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ return ((error != 0) ? error : EINVAL);
+}
+
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+ int error;
+
+ mutex_enter(&os->os_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ *user_table = tb;
+ return (0);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ os->os_sa = sa;
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ }
+
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
+
+ if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error || (error == 0 && layout_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto fail;
+ }
+
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ *user_table = os->os_sa->sa_user_table;
+ mutex_exit(&sa->sa_lock);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ sa_free_attr_table(sa);
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
+ sa_idx_tab_t *tab;
+ while (tab = list_head(&layout->lot_idx_tab)) {
+ ASSERT(refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start = NULL;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int num_lengths = 1;
+ int i;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+ * Determine the number of variable lengths in the header.
+ * The standard 8 byte header has one for free, and a
+ * 16 byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? */
+
+ /* only check if not old znode */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+sa_evict(dmu_buf_t *db, void *sap)
+{
+ panic("evicting sa dbuf %p\n", (void *)db);
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
+ NULL, NULL, NULL);
+
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ dmu_object_info_t doi;
+ sa_handle_t *handle;
+
+#ifdef ZFS_DEBUG
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /* find handle, if it exists */
+ /* if one doesn't exist then create a new one, and initialize it */
+
+ handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+ if (handle == NULL) {
+ sa_handle_t *newhandle;
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+ newhandle = (hdl_type == SA_HDL_SHARED) ?
+ dmu_buf_set_user_ie(db, handle,
+ NULL, sa_evict) : NULL;
+
+ if (newhandle != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = newhandle;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if (error = dmu_bonus_hold(objset, objid, NULL, &db))
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ uio->uio_resid), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+
+}
+#endif
+
+void *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+ * Determine layout number. If SA node and header == 0 then
+ * force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or with crypto enabled which
+ * doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT(IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+ * See if any of the already existing TOC entries can be reused.
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
+ SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to only be used for bulk adding of
+ * attributes for a new file. It will also be used by the ZPL
+ * when converting an old-format znode to native SA support (a usage
+ * sketch follows these functions).
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
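+
+/*
+ * Minimal sketch of building a template for a brand new object and
+ * applying it in one call, assuming attribute ids attr_mode and
+ * attr_size from the sa_setup() user table; hdl, tx and the attribute
+ * names are placeholders used only for illustration:
+ *
+ *	sa_bulk_attr_t tmpl[2];
+ *	uint64_t mode = 0644, size = 0;
+ *	int count = 0, error;
+ *
+ *	SA_ADD_BULK_ATTR(tmpl, count, attr_mode, NULL, &mode, sizeof (mode));
+ *	SA_ADD_BULK_ATTR(tmpl, count, attr_size, NULL, &size, sizeof (size));
+ *	error = sa_replace_all_by_template(hdl, tmpl, count, tx);
+ */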
+
+/*
+ * add/remove/replace a single attribute and then rewrite the entire set
+ * of attributes.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size, spill_data_size;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+ * loop through the bonus buffer and, if it exists, the spill buffer,
+ * and build up a new attr_descriptor to reset the attributes
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /* iterate over each attribute in layout */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ if (attr == newattr) {
+ if (action == SA_REMOVE) {
+ j++;
+ continue;
+ }
+ ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
+ ASSERT(action == SA_REPLACE);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ length = SA_REGISTERED_LEN(sa, attr);
+ if (length == 0) {
+ length = hdr->sa_lengths[length_idx++];
+ }
+
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ length = SA_REGISTERED_LEN(sa, newattr);
+ if (length == 0) {
+ length = buflen;
+ }
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ kmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ return (error);
+}
+
+/*
+ * update or add new attribute
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+ uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = userdata;
+ bulk.sa_data_func = locator;
+ bulk.sa_length = buflen;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+ int error;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
+{
+ (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
+ oldhdl, newhdl, NULL, sa_evict);
+ oldhdl->sa_bonus = NULL;
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
index ca7076cb6fd9..816c09aa0371 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -19,111 +19,36 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * SHA-256 checksum, as specified in FIPS 180-3, available at:
- * http://csrc.nist.gov/publications/PubsFIPS.html
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
- *
- * Ch(x, y, z) (x & y) ^ (~x & z)
- * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z)
- *
- * We use equivalent logical reductions here that require one less op.
- */
-#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
-#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
-{
- uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
- for (t = 0; t < 16; t++, cp += 4)
- W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
-
- for (t = 16; t < 64; t++)
- W[t] = sigma1(W[t - 2]) + W[t - 7] +
- sigma0(W[t - 15]) + W[t - 16];
-
- a = H[0]; b = H[1]; c = H[2]; d = H[3];
- e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
- for (t = 0; t < 64; t++) {
- T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
- T2 = SIGMA0(a) + Maj(a, b, c);
- h = g; g = f; f = e; e = d + T1;
- d = c; c = b; b = a; a = T1 + T2;
- }
-
- H[0] += a; H[1] += b; H[2] += c; H[3] += d;
- H[4] += e; H[5] += f; H[6] += g; H[7] += h;
-}
+#ifdef _KERNEL
+#include <crypto/sha2/sha2.h>
+#else
+#include <sha256.h>
+#endif
void
zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
- uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
- 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
- uint8_t pad[128];
- int i, padsize;
-
- for (i = 0; i < (size & ~63ULL); i += 64)
- SHA256Transform(H, (uint8_t *)buf + i);
-
- for (padsize = 0; i < size; i++)
- pad[padsize++] = *((uint8_t *)buf + i);
-
- for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
- pad[padsize] = 0;
-
- for (i = 56; i >= 0; i -= 8)
- pad[padsize++] = (size << 3) >> i;
-
- for (i = 0; i < padsize; i += 64)
- SHA256Transform(H, pad + i);
-
- ZIO_SET_CHECKSUM(zcp,
- (uint64_t)H[0] << 32 | H[1],
- (uint64_t)H[2] << 32 | H[3],
- (uint64_t)H[4] << 32 | H[5],
- (uint64_t)H[6] << 32 | H[7]);
+ SHA256_CTX ctx;
+ zio_cksum_t tmp;
+
+ SHA256_Init(&ctx);
+ SHA256_Update(&ctx, buf, size);
+ SHA256_Final((unsigned char *)&tmp, &ctx);
+
+ /*
+ * A prior implementation of this function had a private SHA256
+ * implementation that always wrote things out in big endian, and
+ * there wasn't a byteswap variant of it. To preserve on-disk
+ * compatibility we need to force that behaviour.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
}
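
The subtle point in this hunk is why BE_64() on the raw digest words reproduces the retired private implementation's output. A small standalone sketch of the equivalent byte handling (plain C, hypothetical helper names, not part of the patch):

/*
 * Sketch only: the old code built each checksum word as
 * (uint64_t)H[2i] << 32 | H[2i+1] from big-endian 32-bit state words,
 * which is the same as reading the raw 32-byte digest as four
 * big-endian 64-bit integers.  BE_64() on the library's output gives
 * that value on little-endian hosts and is a no-op on big-endian ones.
 */
#include <stdint.h>

static uint64_t
load_be64(const unsigned char *p)
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];	/* most significant byte first */
	return (v);
}

/* Fill the four zio checksum words from a raw SHA-256 digest. */
static void
digest_to_cksum_words(const unsigned char digest[32], uint64_t words[4])
{
	for (int i = 0; i < 4; i++)
		words[i] = load_be64(digest + 8 * i);
}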
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index c04102e17e84..9336a6b516c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -35,13 +34,14 @@
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
+#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -56,8 +56,16 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
-#include <sys/sunddi.h>
#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/zone.h>
+#endif /* _KERNEL */
#include "zfs_prop.h"
#include "zfs_comutil.h"
@@ -70,17 +78,17 @@ TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
"Check hostid on import?");
-enum zti_modes {
+typedef enum zti_modes {
zti_mode_fixed, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */
- zti_mode_tune, /* fill from zio_taskq_tune_* */
+ zti_mode_batch, /* cpu-intensive; value is ignored */
zti_mode_null, /* don't create a taskq */
zti_nmodes
-};
+} zti_modes_t;
#define ZTI_FIX(n) { zti_mode_fixed, (n) }
#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
-#define ZTI_TUNE { zti_mode_tune, 0 }
+#define ZTI_BATCH { zti_mode_batch, 0 }
#define ZTI_NULL { zti_mode_null, 0 }
#define ZTI_ONE ZTI_FIX(1)
@@ -91,7 +99,7 @@ typedef struct zio_taskq_info {
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
- "issue", "issue_high", "intr", "intr_high"
+ "issue", "issue_high", "intr", "intr_high"
};
/*
@@ -101,18 +109,36 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
- { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL },
- { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
+ { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
+ { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
};
-enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
-uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */
-
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
+
+uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
+#ifdef PSRSET_BIND
+id_t zio_taskq_psrset_bind = PS_NONE;
+#endif
+#ifdef SYSDC
+boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+#endif
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
+
+boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
/*
* ==========================================================================
@@ -149,7 +175,7 @@ static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
uint64_t size;
- uint64_t used;
+ uint64_t alloc;
uint64_t cap, version;
zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp;
@@ -157,17 +183,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (spa->spa_root_vdev != NULL) {
- size = spa_get_space(spa);
- used = spa_get_alloc(spa);
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ size = metaslab_class_get_space(spa_normal_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
- spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
- size - used, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+ size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == FREAD), src);
- cap = (size == 0) ? 0 : (used * 100 / size);
+ cap = (size == 0) ? 0 : (alloc * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
spa->spa_root_vdev->vdev_state, src);
@@ -202,9 +233,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
+ objset_t *mos = spa->spa_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
- objset_t *mos = spa->spa_meta_objset;
int err;
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -217,7 +248,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
spa_prop_get_config(spa, nvp);
/* If no pool property object, no more prop to get. */
- if (spa->spa_pool_props_object == 0) {
+ if (mos == NULL || spa->spa_pool_props_object == 0) {
mutex_exit(&spa->spa_props_lock);
return (0);
}
@@ -338,6 +369,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = EINVAL;
@@ -375,12 +407,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
break;
}
- if (error = dmu_objset_open(strval, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os))
+ if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* We don't support gzip bootable datasets */
- if ((error = dsl_prop_get_integer(strval,
+ /* Must be ZPL and not gzip compressed. */
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ error = ENOTSUP;
+ } else if ((error = dsl_prop_get_integer(strval,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
&compress, NULL)) == 0 &&
!BOOTFS_COMPRESS_VALID(compress)) {
@@ -388,7 +422,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
} else {
objnum = dmu_objset_id(os);
}
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
break;
@@ -436,6 +470,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
strcmp(slash, "/..") == 0)
error = EINVAL;
break;
+
+ case ZPOOL_PROP_DEDUPDITTO:
+ if (spa_version(spa) < SPA_VERSION_DEDUP)
+ error = ENOTSUP;
+ else
+ error = nvpair_value_uint64(elem, &intval);
+ if (error == 0 &&
+ intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+ error = EINVAL;
+ break;
}
if (error)
@@ -497,7 +541,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
nvpair_name(elem))) == ZPROP_INVAL)
return (EINVAL);
- if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
continue;
need_sync = B_TRUE;
@@ -569,20 +615,55 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
offsetof(spa_error_entry_t, se_avl));
}
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa, int mode)
+static taskq_t *
+spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+ uint_t value)
{
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ uint_t flags = TASKQ_PREPOPULATE;
+ boolean_t batch = B_FALSE;
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_mode = mode;
+ switch (mode) {
+ case zti_mode_null:
+ return (NULL); /* no taskq needed */
+
+ case zti_mode_fixed:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ break;
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ case zti_mode_batch:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = zio_taskq_batch_pct;
+ break;
+ case zti_mode_online_percent:
+ flags |= TASKQ_THREADS_CPU_PCT;
+ break;
+
+ default:
+ panic("unrecognized mode for %s taskq (%u:%u) in "
+ "spa_activate()",
+ name, mode, value);
+ break;
+ }
+
+#ifdef SYSDC
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ return (taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags));
+ }
+#endif
+ return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+ spa->spa_proc, flags));
+}
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
@@ -593,43 +674,137 @@ spa_activate(spa_t *spa, int mode)
(void) snprintf(name, sizeof (name),
"%s_%s", zio_type_name[t], zio_taskq_types[q]);
- if (mode == zti_mode_tune) {
- mode = zio_taskq_tune_mode;
- value = zio_taskq_tune_value;
- if (mode == zti_mode_tune)
- mode = zti_mode_online_percent;
- }
+ spa->spa_zio_taskq[t][q] =
+ spa_taskq_create(spa, name, mode, value);
+ }
+ }
+}
+
+#ifdef _KERNEL
+#ifdef SPA_PROCESS
+static void
+spa_thread(void *arg)
+{
+ callb_cpr_t cprinfo;
- switch (mode) {
- case zti_mode_fixed:
- ASSERT3U(value, >=, 1);
- value = MAX(value, 1);
+ spa_t *spa = arg;
+ user_t *pu = PTOU(curproc);
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+ spa->spa_name);
+
+ ASSERT(curproc != &p0);
+ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+ "zpool-%s", spa->spa_name);
+ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+#ifdef PSRSET_BIND
+ /* bind this thread to the requested psrset */
+ if (zio_taskq_psrset_bind != PS_NONE) {
+ pool_lock();
+ mutex_enter(&cpu_lock);
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+ 0, NULL, NULL) == 0) {
+ curthread->t_bind_pset = zio_taskq_psrset_bind;
+ } else {
+ cmn_err(CE_WARN,
+ "Couldn't bind process for zfs pool \"%s\" to "
+ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+ }
- spa->spa_zio_taskq[t][q] = taskq_create(name,
- value, maxclsyspri, 50, INT_MAX,
- TASKQ_PREPOPULATE);
- break;
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ }
+#endif
- case zti_mode_online_percent:
- spa->spa_zio_taskq[t][q] = taskq_create(name,
- value, maxclsyspri, 50, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
- break;
+#ifdef SYSDC
+ if (zio_taskq_sysdc) {
+ sysdc_thread_enter(curthread, 100, 0);
+ }
+#endif
- case zti_mode_null:
- spa->spa_zio_taskq[t][q] = NULL;
- break;
+ spa->spa_proc = curproc;
+ spa->spa_did = curthread->t_did;
- case zti_mode_tune:
- default:
- panic("unrecognized mode for "
- "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
- "in spa_activate()",
- t, q, mode, value);
- break;
+ spa_create_zio_taskqs(spa);
+
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+ spa->spa_proc_state = SPA_PROC_ACTIVE;
+ cv_broadcast(&spa->spa_proc_cv);
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+ spa->spa_proc_state = SPA_PROC_GONE;
+ spa->spa_proc = &p0;
+ cv_broadcast(&spa->spa_proc_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
+
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+}
+#endif /* SPA_PROCESS */
+#endif
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, int mode)
+{
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
+
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+ /* Try to create a covering process */
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+ ASSERT(spa->spa_proc == &p0);
+ spa->spa_did = 0;
+
+#ifdef SPA_PROCESS
+ /* Only create a process if we're going to be around a while. */
+ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+ NULL, 0) == 0) {
+ spa->spa_proc_state = SPA_PROC_CREATED;
+ while (spa->spa_proc_state == SPA_PROC_CREATED) {
+ cv_wait(&spa->spa_proc_cv,
+ &spa->spa_proc_lock);
}
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ ASSERT(spa->spa_proc != &p0);
+ ASSERT(spa->spa_did != 0);
+ } else {
+#ifdef _KERNEL
+ cmn_err(CE_WARN,
+ "Couldn't create process for zfs pool \"%s\"\n",
+ spa->spa_name);
+#endif
}
}
+#endif /* SPA_PROCESS */
+ mutex_exit(&spa->spa_proc_lock);
+
+ /* If we didn't create a process, we need to create our taskqs. */
+ ASSERT(spa->spa_proc == &p0);
+ if (spa->spa_proc == &p0) {
+ spa_create_zio_taskqs(spa);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
@@ -688,6 +863,33 @@ spa_deactivate(spa_t *spa)
avl_destroy(&spa->spa_errlist_last);
spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+ mutex_enter(&spa->spa_proc_lock);
+ if (spa->spa_proc_state != SPA_PROC_NONE) {
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+ cv_broadcast(&spa->spa_proc_cv);
+ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+ ASSERT(spa->spa_proc != &p0);
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+ spa->spa_proc_state = SPA_PROC_NONE;
+ }
+ ASSERT(spa->spa_proc == &p0);
+ mutex_exit(&spa->spa_proc_lock);
+
+#ifdef SPA_PROCESS
+ /*
+ * We want to make sure spa_thread() has actually exited the ZFS
+ * module, so that the module can't be unloaded out from underneath
+ * it.
+ */
+ if (spa->spa_did != 0) {
+ thread_join(spa->spa_did);
+ spa->spa_did = 0;
+ }
+#endif /* SPA_PROCESS */
}
/*
@@ -701,7 +903,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
uint_t id, int atype)
{
nvlist_t **child;
- uint_t c, children;
+ uint_t children;
int error;
if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
@@ -722,7 +924,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
return (EINVAL);
}
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vdev_t *vd;
if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
atype)) != 0) {
@@ -768,14 +970,19 @@ spa_unload(spa_t *spa)
spa->spa_async_zio_root = NULL;
}
+ bpobj_close(&spa->spa_deferred_bpobj);
+
/*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
dsl_pool_close(spa->spa_dsl_pool);
spa->spa_dsl_pool = NULL;
+ spa->spa_meta_objset = NULL;
}
+ ddt_unload(spa);
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -928,7 +1135,7 @@ spa_load_spares(spa_t *spa)
KM_SLEEP);
for (i = 0; i < spa->spa_spares.sav_count; i++)
spares[i] = vdev_config_generate(spa,
- spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -950,7 +1157,7 @@ spa_load_l2cache(spa_t *spa)
nvlist_t **l2cache;
uint_t nl2cache;
int i, j, oldnvdevs;
- uint64_t guid, size;
+ uint64_t guid;
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
@@ -1014,12 +1221,8 @@ spa_load_l2cache(spa_t *spa)
(void) vdev_validate_aux(vd);
- if (!vdev_is_dead(vd)) {
- size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
}
}
@@ -1058,7 +1261,7 @@ spa_load_l2cache(spa_t *spa)
l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
- sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
@@ -1098,9 +1301,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
static void
spa_check_removed(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
spa_check_removed(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
@@ -1110,36 +1311,131 @@ spa_check_removed(vdev_t *vd)
}
/*
- * Load the slog device state from the config object since it's possible
- * that the label does not contain the most up-to-date information.
+ * Validate the current config against the MOS config
*/
-void
-spa_load_log_state(spa_t *spa)
+static boolean_t
+spa_config_valid(spa_t *spa, nvlist_t *config)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children, c;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
- for (c = 0; c < children; c++) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+ ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+
+ /*
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing devices in this config.
+ * We'll pass this up to the user for further processing.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops &&
+ mtvd->vdev_islog)
+ child[idx++] = vdev_config_generate(spa, mtvd,
+ B_FALSE, 0);
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+
+ for (int i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (char **));
+ }
+
+ /*
+ * Compare the root vdev tree with the information we have
+ * from the MOS config (mrvd). Check each top-level vdev
+ * with the corresponding MOS config top-level (mtvd).
+ */
+ for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ /*
+ * Resolve any "missing" vdevs in the current configuration.
+ * If we find that the MOS config has more accurate information
+ * about the top-level vdev then use that vdev instead.
+ */
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops) {
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
+ continue;
+
+ /*
+ * Device specific actions.
+ */
+ if (mtvd->vdev_islog) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * XXX - once we have 'readonly' pool
+ * support we should be able to handle
+ * missing data devices by transitioning
+ * the pool to readonly.
+ */
+ continue;
+ }
+
+ /*
+ * Swap the missing vdev with the data we were
+ * able to obtain from the MOS config.
+ */
+ vdev_remove_child(rvd, tvd);
+ vdev_remove_child(mrvd, mtvd);
+
+ vdev_add_child(rvd, mtvd);
+ vdev_add_child(mrvd, tvd);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ vdev_load(mtvd);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ vdev_reopen(rvd);
+ } else if (mtvd->vdev_islog) {
+ /*
+ * Load the slog device's state from the MOS config
+ * since it's possible that the label does not
+ * contain the most up-to-date information.
+ */
+ vdev_load_log_state(tvd, mtvd);
+ vdev_reopen(tvd);
+ }
}
- nvlist_free(nv);
+ vdev_free(mrvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Ensure we were able to validate the config.
+ */
+ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}
/*
* Check for missing log devices
*/
-int
+static int
spa_check_logs(spa_t *spa)
{
switch (spa->spa_log_state) {
@@ -1148,7 +1444,7 @@ spa_check_logs(spa_t *spa)
case SPA_LOG_UNKNOWN:
if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
DS_FIND_CHILDREN)) {
- spa->spa_log_state = SPA_LOG_MISSING;
+ spa_set_log_state(spa, SPA_LOG_MISSING);
return (1);
}
break;
@@ -1156,47 +1452,310 @@ spa_check_logs(spa_t *spa)
return (0);
}
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ if (!spa_has_slogs(spa))
+ return (B_FALSE);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog) {
+ metaslab_group_passivate(mg);
+ slog_found = B_TRUE;
+ }
+ }
+
+ return (slog_found);
+}
+
+static void
+spa_activate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog)
+ metaslab_group_activate(mg);
+ }
+}
+
+int
+spa_offline_log(spa_t *spa)
+{
+ int error = 0;
+
+ if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ NULL, DS_FIND_CHILDREN)) == 0) {
+
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)
+{
+ int i;
+
+ for (i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+}
+
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+} spa_load_error_t;
+
+static void
+spa_load_verify_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
+
+ if (error) {
+ if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_add_64(&sle->sle_meta_count, 1);
+ else
+ atomic_add_64(&sle->sle_data_count, 1);
+ }
+ zio_data_buf_free(zio->io_data, zio->io_size);
+}
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ if (bp != NULL) {
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = zio_data_buf_alloc(size);
+
+ zio_nowait(zio_read(rio, spa, bp, data, size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ }
+ return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_rewind_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error;
+
+ zpool_get_rewind_policy(spa->spa_config, &policy);
+
+ if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+
+ (void) zio_wait(rio);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+ sle.sle_data_count <= policy.zrp_maxdata) {
+ int64_t loss = 0;
+
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = EIO;
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+}
+
/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
+
+/*
+ * Find a value in the pool directory object.
*/
static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
- int error = 0;
- nvlist_t *nvroot = NULL;
- vdev_t *rvd;
- uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
- uint64_t pool_guid;
- uint64_t version;
- uint64_t autoreplace = 0;
- int orig_mode = spa->spa_mode;
- char *ereport = FM_EREPORT_ZFS_POOL;
+ return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val));
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (err);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
/*
- * If this is an untrusted config, access the pool in read-only mode.
- * This prevents things like resilvering recently removed devices.
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
*/
- if (!mosconfig)
- spa->spa_mode = FREAD;
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
- spa->spa_load_state = state;
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+ boolean_t mosconfig)
+{
+ nvlist_t *config = spa->spa_config;
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+ uint64_t pool_guid;
+ nvlist_t *nvl;
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- error = EINVAL;
- goto out;
- }
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
/*
* Versioning wasn't explicitly added to the label until later, so if
* it's not present treat it as the initial version.
*/
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- version = SPA_VERSION_INITIAL;
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&spa->spa_config_txg);
@@ -1204,10 +1763,70 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0)) {
error = EEXIST;
- goto out;
+ } else {
+ spa->spa_load_guid = pool_guid;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ &nvl) == 0) {
+ VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+ KM_SLEEP) == 0);
+ }
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, pool_guid, config, state, type,
+ mosconfig, &ereport);
}
- spa->spa_load_guid = pool_guid;
+ spa->spa_minref = refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+static int
+spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t children, config_cache_txg = spa->spa_config_txg;
+ int orig_mode = spa->spa_mode;
+ int parse;
+ uint64_t obj;
+
+ /*
+ * If this is an untrusted config, access the pool in read-only mode.
+ * This prevents things like resilvering recently removed devices.
+ */
+ if (!mosconfig)
+ spa->spa_mode = FREAD;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+ return (EINVAL);
+
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
/*
* Create "The Godfather" zio to hold all async IOs
@@ -1221,15 +1840,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* configuration requires knowing the version number.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa->spa_ubsync.ub_version = version;
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
- goto out;
+ return (error);
ASSERT(spa->spa_root_vdev == rvd);
- ASSERT(spa_guid(spa) == pool_guid);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
/*
* Try to open all vdevs, loading each label in the process.
@@ -1238,26 +1859,31 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
error = vdev_open(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0)
- goto out;
+ return (error);
/*
* We need to validate the vdev labels against the configuration that
* we have in hand, which is dependent on the setting of mosconfig. If
* mosconfig is true then we're validating the vdev labels based on
- * that config. Otherwise, we're validating against the cached config
+ * that config. Otherwise, we're validating against the cached config
* (zpool.cache) that was read when we loaded the zfs module, and then
* later we will recursively call spa_load() and validate against
* the vdev config.
+ *
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
*/
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0)
- goto out;
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
+ if (error != 0)
+ return (error);
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
}
/*
@@ -1268,32 +1894,33 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* If we weren't able to find a single valid uberblock, return failure.
*/
- if (ub->ub_txg == 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = ENXIO;
- goto out;
- }
+ if (ub->ub_txg == 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
/*
* If the pool is newer than the code, we can't open it.
*/
- if (ub->ub_version > SPA_VERSION) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_VERSION_NEWER);
- error = ENOTSUP;
- goto out;
- }
+ if (ub->ub_version > SPA_VERSION)
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
/*
* If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
+ * incomplete configuration. We first check to see if the pool
+ * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
+ * If it is, defer the vdev_guid_sum check till later so we
+ * can handle missing vdevs.
*/
- if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_GUID_SUM);
- error = ENXIO;
- goto out;
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ rvd->vdev_guid_sum != ub->ub_guid_sum)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
}
/*
@@ -1301,221 +1928,174 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
*/
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+
error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
- if (error) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- goto out;
- }
+ if (error)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (!mosconfig) {
- nvlist_t *newconfig;
uint64_t hostid;
+ nvlist_t *policy = NULL, *nvconfig;
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(newconfig,
+ VERIFY(nvlist_lookup_string(nvconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
+#ifdef _KERNEL
+ myhostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so
+ * we can't use zone_get_hostid().
+ */
(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
+#endif /* _KERNEL */
if (check_hostid && hostid != 0 && myhostid != 0 &&
- (unsigned long)hostid != myhostid) {
+ hostid != myhostid) {
+ nvlist_free(nvconfig);
cmn_err(CE_WARN, "pool '%s' could not be "
"loaded as it was last accessed by "
"another system (host: %s hostid: 0x%lx). "
"See: http://www.sun.com/msg/ZFS-8000-EY",
spa_name(spa), hostname,
(unsigned long)hostid);
- error = EBADF;
- goto out;
+ return (EBADF);
}
}
+ if (nvlist_lookup_nvlist(spa->spa_config,
+ ZPOOL_REWIND_POLICY, &policy) == 0)
+ VERIFY(nvlist_add_nvlist(nvconfig,
+ ZPOOL_REWIND_POLICY, policy) == 0);
- spa_config_set(spa, newconfig);
+ spa_config_set(spa, nvconfig);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
- return (spa_load(spa, newconfig, state, B_TRUE));
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the bit that tells us to use the new accounting function
* (raid-z deflation). If we have an older pool, this will not
* be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
- sizeof (uint64_t), 1, &spa->spa_errlog_last);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
- sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the history object. If we have an older pool, this
* will not be present.
*/
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
- sizeof (uint64_t), 1, &spa->spa_history);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * If we're assembling the pool from the split-off vdevs of
+ * an existing pool, we don't want to attach the spares & cache
+ * devices.
+ */
/*
* Load any hot spares for this pool.
*/
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
+ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
if (load_nvlist(spa, spa->spa_spares.sav_object,
- &spa->spa_spares.sav_config) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ &spa->spa_spares.sav_config) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_spares.sav_sync = B_TRUE;
}
/*
* Load any level 2 ARC devices for this pool.
*/
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
+ error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
&spa->spa_l2cache.sav_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
if (load_nvlist(spa, spa->spa_l2cache.sav_object,
- &spa->spa_l2cache.sav_config) != 0) {
- vdev_set_state(rvd, B_TRUE,
- VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ &spa->spa_l2cache.sav_config) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_l2cache.sav_sync = B_TRUE;
}
- spa_load_log_state(spa);
-
- if (spa_check_logs(spa)) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LOG);
- error = ENXIO;
- ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- goto out;
- }
-
-
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
-
- if (error && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
+ if (error && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
if (error == 0) {
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
- sizeof (uint64_t), 1, &spa->spa_bootfs);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
- sizeof (uint64_t), 1, &autoreplace);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
- sizeof (uint64_t), 1, &spa->spa_delegation);
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
- sizeof (uint64_t), 1, &spa->spa_failmode);
+ uint64_t autoreplace;
+
+ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
+ &spa->spa_dedup_ditto);
+
+ spa->spa_autoreplace = (autoreplace != 0);
}
/*
@@ -1525,8 +2105,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* unopenable vdevs so that the normal autoreplace handler can take
* over.
*/
- if (autoreplace && state != SPA_LOAD_TRYIMPORT)
+ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
spa_check_removed(spa->spa_root_vdev);
+ /*
+ * For the import case, this is done in spa_import(), because
+ * at this point we're using the spare definitions from
+ * the MOS config, not necessarily from the userland config.
+ */
+ if (state != SPA_LOAD_IMPORT) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+ }
/*
* Load the vdev state for all toplevel vdevs.
@@ -1541,15 +2131,60 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
/*
- * Check the state of the root vdev. If it can't be opened, it
- * indicates one or more toplevel vdevs are faulted.
+ * Load the DDTs (dedup tables).
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
+ error = ddt_load(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ spa_update_dspace(spa);
+
+ /*
+ * Validate the config, using the MOS config to fill in any
+ * information which might be missing. If we fail to validate
+ * the config then declare the pool unfit for use. If we're
+ * assembling a pool from a split, the log is not transferred
+ * over.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ nvlist_t *nvconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!spa_config_valid(spa, nvconfig)) {
+ nvlist_free(nvconfig);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+ nvlist_free(nvconfig);
+
+ /*
+ * Now that we've validated the config, check the state of the
+ * root vdev. If it can't be opened, it indicates one or
+ * more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
+ if (spa_check_logs(spa)) {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+ }
}
- if (spa_writeable(spa)) {
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (state != SPA_LOAD_TRYIMPORT) {
+ if (error = spa_load_verify(spa))
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+
+ if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
@@ -1558,31 +2193,44 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* Claim log blocks that haven't been committed yet.
* This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
*/
+ spa->spa_claiming = B_TRUE;
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
(void) dmu_objset_find(spa_name(spa),
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
- spa->spa_log_state = SPA_LOG_GOOD;
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
/*
- * Wait for all claims to sync.
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by either zil_check_log_chain()
+ * (invoked from spa_check_logs()) or zil_claim() above.
*/
- txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
* If the config cache is stale, or we have uninitialized
* metaslabs (see spa_vdev_add()), then update the config.
*
- * If spa_load_verbatim is true, trust the current
+ * If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
+ state == SPA_LOAD_IMPORT ||
+ state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
need_update = B_TRUE;
for (int c = 0; c < rvd->vdev_children; c++)
@@ -1599,19 +2247,100 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
/*
* Check all DTLs to see if anything needs resilvering.
*/
- if (vdev_resilver_needed(rvd, NULL, NULL))
+ if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(rvd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ /*
+ * Delete any inconsistent datasets.
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
}
- error = 0;
-out:
- spa->spa_minref = refcount_count(&spa->spa_refcount);
- if (error && error != EBADF)
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
- spa->spa_load_state = SPA_LOAD_NONE;
- spa->spa_ena = 0;
+ return (0);
+}
- return (error);
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+{
+ int mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
+ spa->spa_load_max_txg--;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+}
+
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+ uint64_t max_request, int rewind_flags)
+{
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+ mosconfig);
+ if (load_error == 0)
+ return (0);
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ return (load_error);
+ }
+
+ /* Price of rolling back is discarding txgs, including log */
+ if (state == SPA_LOAD_RECOVER)
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state, mosconfig);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+
+ return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
/*
@@ -1627,11 +2356,14 @@ out:
* ambiguous state.
*/
static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
{
spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
int error;
int locked = B_FALSE;
+ int firstopen = B_FALSE;
*spapp = NULL;
@@ -1651,11 +2383,24 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
mutex_exit(&spa_namespace_lock);
return (ENOENT);
}
+
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_rewind_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zrp_request & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
spa_activate(spa, spa_mode_global);
- error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+ error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+ policy.zrp_request);
if (error == EBADF) {
/*
@@ -1680,38 +2425,66 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
* information: the state of each vdev after the
* attempted vdev_open(). Return this to the user.
*/
- if (config != NULL && spa->spa_root_vdev != NULL)
- *config = spa_config_generate(spa, NULL, -1ULL,
- B_TRUE);
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
spa_unload(spa);
spa_deactivate(spa);
- spa->spa_last_open_failed = B_TRUE;
+ spa->spa_last_open_failed = error;
if (locked)
mutex_exit(&spa_namespace_lock);
*spapp = NULL;
return (error);
- } else {
- spa->spa_last_open_failed = B_FALSE;
}
}
spa_open_ref(spa, tag);
- if (locked)
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
mutex_exit(&spa_namespace_lock);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ if (firstopen)
+ zvol_create_minors(pool);
+#endif
+#endif
+ }
*spapp = spa;
- if (config != NULL)
- *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
return (0);
}
int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
spa_open(const char *name, spa_t **spapp, void *tag)
{
- return (spa_open_common(name, spapp, tag, NULL));
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
}
/*
@@ -1782,7 +2555,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
if (spa_spare_exists(guid, &pool, NULL) &&
pool != 0ULL) {
VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_STATS,
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
vs->vs_state = VDEV_STATE_CANT_OPEN;
vs->vs_aux = VDEV_AUX_SPARED;
@@ -1839,7 +2612,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
ASSERT(vd != NULL);
VERIFY(nvlist_lookup_uint64_array(l2cache[i],
- ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
vdev_get_stats(vd, vs);
}
}
@@ -1852,7 +2626,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_t *spa;
*config = NULL;
- error = spa_open_common(name, &spa, FTAG, config);
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
if (spa != NULL) {
/*
@@ -1863,6 +2637,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
VERIFY(nvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
spa_get_errlog_size(spa)) == 0);
@@ -2092,11 +2873,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
- int c, error = 0;
+ int error = 0;
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version;
+ uint64_t version, obj;
/*
* If this pool already exists, return failure.
@@ -2112,11 +2893,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
+ spa = spa_add(pool, NULL, altroot);
spa_activate(spa, spa_mode_global);
- spa->spa_uberblock.ub_txg = txg - 1;
-
if (props && (error = spa_prop_validate(spa, props))) {
spa_deactivate(spa);
spa_remove(spa);
@@ -2128,6 +2907,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
&version) != 0)
version = SPA_VERSION;
ASSERT(version <= SPA_VERSION);
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
@@ -2154,9 +2936,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_metaslab_set_size(rvd->vdev_child[c]);
+ vdev_expand(rvd->vdev_child[c], txg);
+ }
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -2202,6 +2985,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
spa->spa_meta_objset = dp->dp_meta_objset;
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
+ spa_update_dspace(spa);
+
tx = dmu_tx_create_assigned(dp, txg);
/*
@@ -2217,6 +3007,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
cmn_err(CE_PANIC, "failed to add pool config");
}
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
/* Newly created pools with the right version are always deflated. */
if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
spa->spa_deflate = TRUE;
@@ -2228,20 +3024,20 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
}
/*
- * Create the deferred-free bplist object. Turn off compression
+ * Create the deferred-free bpobj. Turn off compression
* because sync-to-convergence takes longer if the blocksize
* keeps changing.
*/
- spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
- 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
ZIO_COMPRESS_OFF, tx);
-
if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bplist");
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
}
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
/*
* Create the pool's history object.
@@ -2255,9 +3051,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_sync_props(spa, props, CRED(), tx);
+ spa_sync_props(spa, props, tx);
}
dmu_tx_commit(tx);
@@ -2275,6 +3073,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
+ spa_history_log_version(spa, LOG_POOL_CREATE);
spa->spa_minref = refcount_count(&spa->spa_refcount);
@@ -2283,32 +3082,39 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
return (0);
}
-#ifdef sun
+#if defined(sun)
#ifdef _KERNEL
/*
- * Build a "root" vdev for a top level vdev read in from a rootpool
- * device label.
+ * Get the root pool information from the root disk, then import the root pool
+ * at system boot time.

*/
-static void
-spa_build_rootpool_config(nvlist_t *config)
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+static nvlist_t *
+spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
+ nvlist_t *config;
nvlist_t *nvtop, *nvroot;
uint64_t pgid;
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
/*
* Add this top-level vdev to the child array.
*/
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
- == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
- == 0);
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
/*
* Put this pool's top-level vdevs into a root vdev.
*/
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
- == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
@@ -2320,127 +3126,40 @@ spa_build_rootpool_config(nvlist_t *config)
*/
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
+ return (config);
}
/*
- * Get the root pool information from the root disk, then import the root pool
- * during the system boot up time.
- */
-extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
-
-int
-spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf,
- uint64_t *besttxg)
-{
- nvlist_t *config;
- uint64_t txg;
- int error;
-
- if (error = vdev_disk_read_rootlabel(devpath, devid, &config))
- return (error);
-
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
-
- if (bestconf != NULL)
- *bestconf = config;
- else
- nvlist_free(config);
- *besttxg = txg;
- return (0);
-}
-
-boolean_t
-spa_rootdev_validate(nvlist_t *nv)
-{
- uint64_t ival;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Given the boot device's physical path or devid, check if the device
- * is in a valid state. If so, return the configuration from the vdev
- * label.
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
*/
-int
-spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
+static void
+spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
- nvlist_t *conf = NULL;
- uint64_t txg = 0;
- nvlist_t *nvtop, **child;
- char *type;
- char *bootpath = NULL;
- uint_t children, c;
- char *tmp;
- int error;
-
- if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
- *tmp = '\0';
- if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
- cmn_err(CE_NOTE, "error reading device label");
- return (error);
- }
- if (txg == 0) {
- cmn_err(CE_NOTE, "this device is detached");
- nvlist_free(conf);
- return (EINVAL);
- }
-
- VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
- if (strcmp(type, VDEV_TYPE_DISK) == 0) {
- if (spa_rootdev_validate(nvtop)) {
- goto out;
- } else {
- nvlist_free(conf);
- return (EINVAL);
- }
- }
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
- ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
- VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
- /*
- * Go thru vdevs in the mirror to see if the given device
- * has the most recent txg. Only the device with the most
- * recent txg has valid information and should be booted.
- */
- for (c = 0; c < children; c++) {
- char *cdevid, *cpath;
- uint64_t tmptxg;
-
- cpath = NULL;
- cdevid = NULL;
- if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
- &cpath) != 0 && nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_DEVID, &cdevid) != 0)
- return (EINVAL);
- if ((spa_check_rootconf(cpath, cdevid, NULL,
- &tmptxg) == 0) && (tmptxg > txg)) {
- txg = tmptxg;
- VERIFY(nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_PATH, &bootpath) == 0);
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
}
+ nvlist_free(label);
}
-
- /* Does the best device match the one we've booted from? */
- if (bootpath) {
- cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
- return (EINVAL);
- }
-out:
- *bestconf = conf;
- return (0);
}
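
A self-contained sketch of the depth-first walk in spa_alt_rootvdev() above; struct node is a hypothetical stand-in for vdev_t, with the label txg assumed to be already read for each leaf.

#include <stdint.h>

/* hypothetical stand-in for vdev_t */
struct node {
        uint64_t        txg;            /* txg recorded in the leaf's label */
        struct node     *child[8];
        int             nchildren;
};

/* keep the leaf whose label has the newest txg, as the walk above does */
static void
best_by_txg(struct node *n, struct node **best, uint64_t *best_txg)
{
        for (int c = 0; c < n->nchildren; c++)
                best_by_txg(n->child[c], best, best_txg);

        if (n->nchildren == 0 && n->txg > *best_txg) {
                *best_txg = n->txg;
                *best = n;
        }
}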
/*
@@ -2458,24 +3177,35 @@ out:
int
spa_import_rootpool(char *devpath, char *devid)
{
- nvlist_t *conf = NULL;
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
char *pname;
int error;
- spa_t *spa;
/*
- * Get the vdev pathname and configuation from the most
- * recently updated vdev (highest txg).
+ * Read the label from the boot device and generate a configuration.
*/
- if (error = spa_get_rootconf(devpath, devid, &conf))
- goto msg_out;
-
- /*
- * Add type "root" vdev to the config.
- */
- spa_build_rootpool_config(conf);
+ config = spa_generate_rootconf(devpath, devid, &guid);
+#if defined(_OBP) && defined(_KERNEL)
+ if (config == NULL) {
+ if (strstr(devpath, "/iscsi/ssd") != NULL) {
+ /* iscsi boot */
+ get_iscsi_bootpath_phy(devpath);
+ config = spa_generate_rootconf(devpath, devid, &guid);
+ }
+ }
+#endif
+ if (config == NULL) {
+ cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+ devpath);
+ return (EIO);
+ }
- VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pname)) != NULL) {
@@ -2486,71 +3216,90 @@ spa_import_rootpool(char *devpath, char *devid)
spa_remove(spa);
}
- spa = spa_add(pname, NULL);
+ spa = spa_add(pname, config, NULL);
spa->spa_is_root = B_TRUE;
- spa->spa_load_verbatim = B_TRUE;
-
- VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0);
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(conf);
- return (0);
-
-msg_out:
- cmn_err(CE_NOTE, "\n"
- " *************************************************** \n"
- " * This device is not bootable! * \n"
- " * It is either offlined or detached or faulted. * \n"
- " * Please try to boot from a different device. * \n"
- " *************************************************** ");
-
- return (error);
-}
-#endif
-#endif /* sun */
-
-/*
- * Take a pool and insert it into the namespace as if it had been loaded at
- * boot.
- */
-int
-spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
-{
- spa_t *spa;
- char *altroot = NULL;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
mutex_exit(&spa_namespace_lock);
- return (EEXIST);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
}
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
-
- spa->spa_load_verbatim = B_TRUE;
-
- VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ /*
+ * Get the boot vdev.
+ */
+ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
+ (u_longlong_t)guid);
+ error = ENOENT;
+ goto out;
+ }
- if (props != NULL)
- spa_configfile_set(spa, props, B_FALSE);
+ /*
+ * Determine if there is a better boot device.
+ */
+ avd = bvd;
+ spa_alt_rootvdev(rvd, &avd, &txg);
+ if (avd != bvd) {
+ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
+ "try booting from '%s'", avd->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
- spa_config_sync(spa, B_FALSE, B_TRUE);
+ /*
+ * If the boot device is part of a spare vdev then ensure that
+ * we're booting off the active spare.
+ */
+ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ !bvd->vdev_isspare) {
+ cmn_err(CE_NOTE, "The boot device is currently spared. Please "
+ "try booting from '%s'",
+ bvd->vdev_parent->
+ vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+ error = 0;
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+out:
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&spa_namespace_lock);
- return (0);
+ nvlist_free(config);
+ return (error);
}
+#endif
+#endif /* sun */
+
/*
* Import a non-root pool into the system.
*/
int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
spa_t *spa;
char *altroot = NULL;
+ spa_load_state_t state = SPA_LOAD_IMPORT;
+ zpool_rewind_policy_t policy;
+ uint64_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
int error;
nvlist_t *nvroot;
nvlist_t **spares, **l2cache;
@@ -2560,7 +3309,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
* If a pool with this name exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pool)) != NULL) {
+ if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
return (EEXIST);
}
@@ -2570,20 +3319,57 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, altroot);
- spa_activate(spa, spa_mode_global);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = FREAD;
+ spa = spa_add(pool, config, altroot);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+
+ mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+
+ return (0);
+ }
+
+ spa_activate(spa, mode);
/*
* Don't start async tasks until we know everything is healthy.
*/
spa_async_suspend(spa);
+ zpool_get_rewind_policy(config, &policy);
+ if (policy.zrp_request & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
/*
* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
* because the user-supplied config is actually the one to trust when
* doing an import.
*/
- error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+ error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
+ policy.zrp_request);
+
+ /*
+ * Propagate anything learned while loading the pool and pass it
+ * back to the caller (e.g. rewind info, missing devices, etc.).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -2660,6 +3446,14 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa->spa_l2cache.sav_sync = B_TRUE;
}
+ /*
+ * Check for any removed devices.
+ */
+ if (spa->spa_autoreplace) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+
if (spa_writeable(spa)) {
/*
* Update the config cache to include the newly-imported pool.
@@ -2667,17 +3461,23 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
+ /*
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
+ */
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+
mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ zvol_create_minors(pool);
+#endif
+#endif
return (0);
}
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define TRYIMPORT_NAME "$import"
-
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
@@ -2697,7 +3497,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Create and initialize the spa structure.
*/
mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, NULL);
+ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa_activate(spa, FREAD);
/*
@@ -2705,7 +3505,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Pass TRUE for mosconfig because the user-supplied config
* is actually the one to trust when doing an import.
*/
- error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
/*
* If 'tryconfig' was at least parsable, return the current config.
@@ -2850,7 +3650,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
}
@@ -2920,13 +3721,15 @@ spa_reset(char *pool)
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
@@ -2961,9 +3764,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* Transfer each new top-level vdev from vd to rvd.
*/
for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
@@ -3020,15 +3833,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
- uint64_t txg, open_txg;
+ uint64_t txg, dtl_max_txg;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
- dmu_tx_t *tx;
char *oldvdpath, *newvdpath;
int newvd_isspare;
int error;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3078,7 +3892,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* spares.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_child[1] == oldvd &&
+ oldvd->vdev_isspare &&
!spa_has_spare(spa, newvd->vdev_guid))
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -3090,23 +3904,24 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* the same (spare replaces spare, non-spare replaces
* non-spare).
*/
- if (pvd->vdev_ops == &vdev_replacing_ops)
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare)
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops != &vdev_spare_ops &&
- newvd->vdev_isspare)
+ }
+
+ if (newvd->vdev_isspare)
pvops = &vdev_spare_ops;
else
pvops = &vdev_replacing_ops;
}
/*
- * Compare the new device size with the replaceable/attachable
- * device size.
+ * Make sure the new device is big enough.
*/
- if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -3132,6 +3947,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
+ /* mark the device being resilvered */
+ newvd->vdev_resilvering = B_TRUE;
+
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@@ -3148,14 +3966,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
- /*
- * If newvd is smaller than oldvd, but larger than its rsize,
- * the addition of newvd may have decreased our parent's asize.
- */
- pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
-
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
@@ -3163,13 +3976,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_config_dirty(tvd);
/*
- * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
- * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
*/
- open_txg = txg + TXG_CONCURRENT_STATES - 1;
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@@ -3185,27 +3999,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_dirty(tvd, VDD_DTL, newvd, txg);
- (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+ /*
+ * Restart the resilver
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
- spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
- CRED(), "%s vdev=%s %s vdev=%s",
- replacing && newvd_isspare ? "spare in" :
- replacing ? "replace" : "attach", newvdpath,
- replacing ? "for" : "to", oldvdpath);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+ spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
+ "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
- /*
- * Kick off a resilver to update newvd.
- */
- VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
return (0);
}
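
The dirtied DTL range above changed from the closed interval [TXG_INITIAL, open_txg] to the half-open [TXG_INITIAL, dtl_max_txg), expressed as a (start, length) pair. A small sketch of the arithmetic, with the constants given hypothetical values:

#include <assert.h>
#include <stdint.h>

#define TXG_INITIAL             4ULL    /* hypothetical values for the sketch */
#define TXG_CONCURRENT_STATES   3ULL

int
main(void)
{
        uint64_t txg = 100;
        uint64_t dtl_max_txg = txg + TXG_CONCURRENT_STATES;

        /* vdev_dtl_dirty() takes (start, length); this covers [TXG_INITIAL, dtl_max_txg) */
        uint64_t start = TXG_INITIAL;
        uint64_t length = dtl_max_txg - TXG_INITIAL;

        assert(start + length == dtl_max_txg);
        return (0);
}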
@@ -3224,7 +4038,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
uint64_t unspare_guid;
- size_t len;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
@@ -3255,18 +4071,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/*
- * If replace_done is specified, only remove this device if it's
- * the first child of a replacing vdev. For the 'spare' vdev, either
- * disk can be removed.
+ * Only 'replacing' or 'spare' vdevs can be replaced.
*/
- if (replace_done) {
- if (pvd->vdev_ops == &vdev_replacing_ops) {
- if (vd->vdev_id != 0)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- } else if (pvd->vdev_ops != &vdev_spare_ops) {
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- }
- }
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
spa_version(spa) >= SPA_VERSION_SPARES);
@@ -3293,16 +4102,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* check to see if we changed the original vdev's path to have "/old"
* at the end in spa_vdev_attach(). If so, undo that change now.
*/
- if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
- pvd->vdev_child[0]->vdev_path != NULL &&
- pvd->vdev_child[1]->vdev_path != NULL) {
- ASSERT(pvd->vdev_child[1] == vd);
- cvd = pvd->vdev_child[0];
- len = strlen(vd->vdev_path);
- if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
- strcmp(cvd->vdev_path + len, "/old") == 0) {
- spa_strfree(cvd->vdev_path);
- cvd->vdev_path = spa_strdup(vd->vdev_path);
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
}
}
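
A standalone sketch of the path comparison used in the loop above: a sibling path matches when it is the detached vdev's path with the literal suffix "/old" appended.

#include <string.h>

/* does `cand` equal `path` followed immediately by "/old"? */
static int
is_old_variant(const char *path, const char *cand)
{
        size_t len = strlen(path);

        return (strncmp(cand, path, len) == 0 &&
            strcmp(cand + len, "/old") == 0);
}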
@@ -3312,7 +4127,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* active spare list for the pool.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+ vd->vdev_id == 0 &&
+ pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
/*
@@ -3334,7 +4150,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
/*
* Remember one of the remaining children so we can get tvd below.
*/
- cvd = pvd->vdev_child[0];
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
/*
* If we need to remove the remaining child from the list of hot spares,
@@ -3350,14 +4166,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid;
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
}
/*
* If the parent mirror/replacing vdev only has one child,
* the parent is no longer needed. Remove it from the tree.
*/
- if (pvd->vdev_children == 1)
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
+ cvd->vdev_resilvering = B_FALSE;
+ }
+
/*
* We don't set tvd until now because the parent we just removed
@@ -3372,12 +4194,16 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_propagate_state(cvd);
/*
- * If the device we just detached was smaller than the others, it may be
- * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
- * can't fail because the existing metaslabs are already in core, so
- * there's nothing to read from disk.
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
*/
- VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
vdev_config_dirty(tvd);
@@ -3387,6 +4213,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* But first make sure we're not on any *other* txg's DTL list, to
* prevent vd from being accessed after it's freed.
*/
+ vdpath = spa_strdup(vd->vdev_path);
for (int t = 0; t < TXG_SIZE; t++)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
vd->vdev_detached = B_TRUE;
@@ -3394,31 +4221,335 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
error = spa_vdev_exit(spa, vd, txg, 0);
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
/*
* If this was the removal of the original device in a hot spare vdev,
* then we want to go through and remove the device from the hot spare
* list of every other pool.
*/
if (unspare) {
- spa_t *myspa = spa;
- spa = NULL;
+ spa_t *altspa = NULL;
+
mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
- if (spa == myspa)
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
continue;
- spa_open_ref(spa, FTAG);
+
+ spa_open_ref(altspa, FTAG);
mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
+ spa_close(altspa, FTAG);
}
mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
}
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+{
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_offline_log(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || vd->vdev_ishole) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = EINVAL;
+ break;
+ }
+ }
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = EINVAL;
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = ENODEV;
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ vml[c]->vdev_ishole ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = EINVAL;
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c])) {
+ error = EBUSY;
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ /*
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
+ */
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
+
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+#ifndef sun
+ /* mark that we are creating new spa by splitting */
+ newspa->spa_splitting_newspa = B_TRUE;
+#endif
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+#ifndef sun
+ newspa->spa_splitting_newspa = B_FALSE;
+#endif
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
+ }
+
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
+
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
+
+ spa_async_resume(newspa);
+
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+ spa, tx, "vdev=%s",
+ vml[c]->vdev_path);
+ vdev_free(vml[c]);
+ }
+ }
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+ "split new pool %s from pool %s", newname, spa_name(spa));
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ kmem_free(vml, children * sizeof (vdev_t *));
return (error);
}
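
A self-contained sketch of the child-count check near the top of spa_vdev_split_mirror() above: only a trailing run of log/hole top-level vdevs is excluded from the number of splittable children. The flag array is a hypothetical flattening of the vdev tree.

/* number of splittable top-level vdevs, ignoring a trailing run of logs/holes */
static unsigned
splittable_children(const int *is_log_or_hole, unsigned nchildren)
{
        unsigned lastlog = 0;

        for (unsigned c = 0; c < nchildren; c++) {
                if (is_log_or_hole[c]) {
                        if (lastlog == 0)
                                lastlog = c;    /* start of a possible trailing run */
                        continue;
                }
                lastlog = 0;                    /* a data vdev follows: reset */
        }
        return (lastlog != 0 ? lastlog : nchildren);
}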
@@ -3464,19 +4595,118 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
+ * Evacuate the device.
+ */
+static int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+ uint64_t txg;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer
+ * since we need to do I/O, but we do keep the
+ * spa_namespace_lock held. Once this completes, the device
+ * should no longer have any blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_offline_log(spa);
+ } else {
+ error = ENOTSUP;
+ }
+
+ if (error)
+ return (error);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+ txg = spa_vdev_config_enter(spa);
+ vd->vdev_removing = B_TRUE;
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_config_dirty(vd);
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+static void
+spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t id = vd->vdev_id;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Only remove any devices which are empty.
+ */
+ if (vd->vdev_stat.vs_alloc != 0)
+ return;
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ vdev_free(vd);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
+/*
* Remove a device from the pool. Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
+ metaslab_group_t *mg;
nvlist_t **spares, **l2cache, *nv;
- uint_t nspares, nl2cache;
uint64_t txg = 0;
+ uint_t nspares, nl2cache;
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ ASSERT(spa_writeable(spa));
+
if (!locked)
txg = spa_vdev_enter(spa);
@@ -3509,6 +4739,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * XXX - Once we have bp-rewrite this should
+ * become the common case.
+ */
+
+ mg = vd->vdev_mg;
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Attempt to evacuate the vdev.
+ */
+ error = spa_vdev_remove_evacuate(spa, vd);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /*
+ * If we couldn't evacuate the vdev, unwind.
+ */
+ if (error) {
+ metaslab_group_activate(mg);
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ spa_vdev_remove_from_namespace(spa, vd);
+
} else if (vd != NULL) {
/*
* Normal vdevs cannot be removed (yet).
@@ -3535,22 +4808,29 @@ static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
vdev_t *newvd, *oldvd;
- int c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
if (oldvd != NULL)
return (oldvd);
}
/*
- * Check for a completed replacement.
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK as it will require
+ * user intervention to determine which disk the admin wishes to keep.
*/
- if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
oldvd = vd->vdev_child[0];
- newvd = vd->vdev_child[1];
if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
!vdev_dtl_required(oldvd))
return (oldvd);
}
@@ -3558,15 +4838,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
/*
* Check for a completed resilver with the 'unspare' flag set.
*/
- if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
- newvd = vd->vdev_child[0];
- oldvd = vd->vdev_child[1];
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
- if (newvd->vdev_unspare &&
+ if (oldvd != NULL &&
vdev_dtl_empty(newvd, DTL_MISSING) &&
- !vdev_dtl_required(oldvd)) {
- newvd->vdev_unspare = 0;
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
return (oldvd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
}
}
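
A minimal sketch of the first/last selection in the spare branch above, with a hypothetical struct child in place of vdev_t: whichever end carries the unspare flag determines which disk gets detached.

struct child { int unspare; };

/* return the index of the child to detach, or -1 if nothing is pending */
static int
pick_detach(const struct child *c, int n)
{
        if (c[n - 1].unspare)
                return (0);             /* last child marked: detach the original disk */
        if (c[0].unspare)
                return (n - 1);         /* first child marked: detach the last disk */
        return (-1);
}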
@@ -3593,9 +4899,9 @@ spa_vdev_resilver_done(spa_t *spa)
* we need to detach the parent's first child (the original hot
* spare) as well.
*/
- if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(ppvd->vdev_children == 2);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -3610,36 +4916,43 @@ spa_vdev_resilver_done(spa_t *spa)
}
/*
- * Update the stored path or FRU for this vdev. Dirty the vdev configuration,
- * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
+ * Update the stored path or FRU for this vdev.
*/
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
boolean_t ispath)
{
vdev_t *vd;
- uint64_t txg;
+ boolean_t sync = B_FALSE;
- txg = spa_vdev_enter(spa);
+ ASSERT(spa_writeable(spa));
+
+ spa_vdev_state_enter(spa, SCL_ALL);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ return (spa_vdev_state_exit(spa, NULL, ENOENT));
if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
if (ispath) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(value);
+ if (strcmp(value, vd->vdev_path) != 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ sync = B_TRUE;
+ }
} else {
- if (vd->vdev_fru != NULL)
+ if (vd->vdev_fru == NULL) {
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ } else if (strcmp(value, vd->vdev_fru) != 0) {
spa_strfree(vd->vdev_fru);
- vd->vdev_fru = spa_strdup(value);
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ }
}
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
+ return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}
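
A userland sketch of the set-only-if-changed pattern introduced above for vdev path/FRU updates; it reports a change only when the stored string actually differs, mirroring the sync flag.

#include <stdlib.h>
#include <string.h>

/* replace *slot with value; report whether a config sync would be needed */
static int
set_string(char **slot, const char *value)
{
        if (*slot != NULL && strcmp(*slot, value) == 0)
                return (0);             /* unchanged: nothing to sync */
        free(*slot);
        *slot = strdup(value);
        return (1);
}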
int
@@ -3656,40 +4969,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
/*
* ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
* ==========================================================================
*/
int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (EBUSY);
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
- if ((uint_t)type >= POOL_SCRUB_TYPES)
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
return (ENOTSUP);
/*
* If a resilver was requested, but there is no DTL on a
* writeable leaf device, we have nothing to do.
*/
- if (type == POOL_SCRUB_RESILVER &&
+ if (func == POOL_SCAN_RESILVER &&
!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
return (0);
}
- if (type == POOL_SCRUB_EVERYTHING &&
- spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
- spa->spa_dsl_pool->dp_scrub_isresilver)
- return (EBUSY);
-
- if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
- return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
- } else if (type == POOL_SCRUB_NONE) {
- return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
- } else {
- return (EINVAL);
- }
+ return (dsl_scan(spa->spa_dsl_pool, func));
}
/*
@@ -3702,7 +5013,8 @@ static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_remove_wanted) {
- vd->vdev_remove_wanted = 0;
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
/*
@@ -3726,7 +5038,7 @@ static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = 0;
+ vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
}
@@ -3735,6 +5047,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd)
}
static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath;
+
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+}
+
+static void
spa_async_thread(void *arg)
{
spa_t *spa = arg;
@@ -3751,16 +5094,31 @@ spa_async_thread(void *arg)
* See if the config needs to be updated.
*/
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t old_space, new_space;
+
mutex_enter(&spa_namespace_lock);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (new_space != old_space) {
+ spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
+ spa, NULL,
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), new_space, new_space - old_space);
+ }
}
/*
* See if any devices need to be marked REMOVED.
*/
if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_remove(spa, spa->spa_root_vdev);
for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
@@ -3769,11 +5127,17 @@ spa_async_thread(void *arg)
(void) spa_vdev_state_exit(spa, NULL, 0);
}
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
/*
* See if any devices need to be probed.
*/
if (tasks & SPA_ASYNC_PROBE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -3788,7 +5152,7 @@ spa_async_thread(void *arg)
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
/*
* Let the world know that we're done.
@@ -3834,6 +5198,7 @@ spa_async_dispatch(spa_t *spa)
void
spa_async_request(spa_t *spa, int task)
{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks |= task;
mutex_exit(&spa->spa_async_lock);
@@ -3845,37 +5210,22 @@ spa_async_request(spa_t *spa, int task)
* ==========================================================================
*/
-static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
- bplist_t *bpl = &spa->spa_sync_bplist;
- dmu_tx_t *tx;
- blkptr_t blk;
- uint64_t itor = 0;
- zio_t *zio;
- int error;
- uint8_t c = 1;
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- ASSERT(blk.blk_birth < txg);
- zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
- }
-
- error = zio_wait(zio);
- ASSERT3U(error, ==, 0);
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, tx);
+ return (0);
+}
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- bplist_vacate(bpl, tx);
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *zio = arg;
- /*
- * Pre-dirty the first block so we sync to convergence faster.
- * (Usually only the first block is needed.)
- */
- dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
- dmu_tx_commit(tx);
+ zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+ zio->io_flags));
+ return (0);
}
static void
@@ -3942,7 +5292,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
- B_FALSE, B_FALSE, B_TRUE);
+ B_FALSE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
sav->sav_count) == 0);
for (i = 0; i < sav->sav_count; i++)
@@ -3982,7 +5332,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
* Set zpool properties.
*/
static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
objset_t *mos = spa->spa_meta_objset;
@@ -4023,9 +5373,11 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT(spa->spa_root != NULL);
break;
+ case ZPOOL_PROP_READONLY:
case ZPOOL_PROP_CACHEFILE:
/*
- * 'cachefile' is also a non-persisitent property.
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
*/
break;
default:
@@ -4033,8 +5385,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* Set pool property values in the poolprops mos object.
*/
if (spa->spa_pool_props_object == 0) {
- objset_t *mos = spa->spa_meta_objset;
-
VERIFY((spa->spa_pool_props_object =
zap_create(mos, DMU_OT_POOL_PROPS,
DMU_OT_NONE, 0, tx)) > 0);
@@ -4081,6 +5431,15 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
case ZPOOL_PROP_FAILUREMODE:
spa->spa_failmode = intval;
break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_DEDUPDITTO:
+ spa->spa_dedup_ditto = intval;
+ break;
default:
break;
}
@@ -4089,8 +5448,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* log internal history if this is not a zpool create */
if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
tx->tx_txg != TXG_INITIAL) {
- spa_history_internal_log(LOG_POOL_PROPSET,
- spa, tx, cr, "%s %lld %s",
+ spa_history_log_internal(LOG_POOL_PROPSET,
+ spa, tx, "%s %lld %s",
nvpair_name(elem), intval, spa_name(spa));
}
}
@@ -4099,6 +5458,42 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ ASSERT(spa->spa_sync_pass == 1);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+}
+
+/*
* Sync the specified transaction group. New blocks may be dirtied as
* part of the process, so we iterate until it converges.
*/
@@ -4107,13 +5502,15 @@ spa_sync(spa_t *spa, uint64_t txg)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bplist_t *bpl = &spa->spa_sync_bplist;
+ bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
- int dirty_vdevs;
int error;
+ VERIFY(spa_writeable(spa));
+
/*
* Lock out configuration changes.
*/
@@ -4146,8 +5543,6 @@ spa_sync(spa_t *spa, uint64_t txg)
}
spa_config_exit(spa, SCL_STATE, FTAG);
- VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
-
tx = dmu_tx_create_assigned(dp, txg);
/*
@@ -4171,34 +5566,29 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}
- if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
- dsl_pool_create_origin(dp, tx);
-
- /* Keeping the origin open increases spa_minref */
- spa->spa_minref += 3;
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
- dsl_pool_upgrade_clones(dp, tx);
- }
-
/*
- * If anything has changed in this txg, push the deferred frees
- * from the previous txg. If not, leave them alone so that we
- * don't generate work on an otherwise idle system.
+ * If anything has changed in this txg, or if someone is waiting
+ * for this txg to sync (eg, spa_vdev_remove()), push the
+ * deferred frees from the previous txg. If not, leave them
+ * alone so that we don't generate work on an otherwise idle
+ * system.
*/
if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
!txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg))
- spa_sync_deferred_frees(spa, txg);
+ !txg_list_empty(&dp->dp_sync_tasks, txg) ||
+ ((dsl_scan_active(dp->dp_scan) ||
+ txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(defer_bpo,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY3U(zio_wait(zio), ==, 0);
+ }
/*
* Iterate to convergence.
*/
do {
- spa->spa_sync_pass++;
+ int pass = ++spa->spa_sync_pass;
spa_sync_config_object(spa, tx);
spa_sync_aux_dev(spa, &spa->spa_spares, tx,
@@ -4208,18 +5598,26 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_errlog_sync(spa, txg);
dsl_pool_sync(dp, txg);
- dirty_vdevs = 0;
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
- vdev_sync(vd, txg);
- dirty_vdevs++;
+ if (pass <= SYNC_PASS_DEFERRED_FREE) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(free_bpl, spa_free_sync_cb,
+ zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+ } else {
+ bplist_iterate(free_bpl, bpobj_enqueue_cb,
+ defer_bpo, tx);
}
- bplist_sync(bpl, tx);
- } while (dirty_vdevs);
+ ddt_sync(spa, txg);
+ dsl_scan_sync(dp, tx);
+
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ vdev_sync(vd, txg);
- bplist_close(bpl);
+ if (pass == 1)
+ spa_sync_upgrades(spa, tx);
- dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+ } while (dmu_objset_is_dirty(mos, txg));
/*
* Rewrite the vdev configuration (which includes the uberblock)
@@ -4242,9 +5640,8 @@ spa_sync(spa_t *spa, uint64_t txg)
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
- int c;
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
if (vd->vdev_ms_array == 0 || vd->vdev_islog)
continue;
@@ -4291,10 +5688,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_ubsync = spa->spa_uberblock;
- /*
- * Clean up the ZIL records for the synced txg.
- */
- dsl_pool_zil_clean(dp);
+ dsl_pool_sync_done(dp, txg);
/*
* Update usable space statistics.
@@ -4302,6 +5696,8 @@ spa_sync(spa_t *spa, uint64_t txg)
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
vdev_sync_done(vd, txg);
+ spa_update_dspace(spa);
+
/*
* It had better be the case that we didn't dirty anything
* since vdev_config_sync().
@@ -4309,10 +5705,13 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(bpl->bpl_queue == NULL);
+
+ spa->spa_sync_pass = 0;
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_handle_ignored_writes(spa);
+
/*
* If any async tasks have been requested, kick them off.
*/
@@ -4330,7 +5729,8 @@ spa_sync_allpools(void)
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -4410,6 +5810,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
void
spa_upgrade(spa_t *spa, uint64_t version)
{
+ ASSERT(spa_writeable(spa));
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -4479,7 +5881,6 @@ spa_has_active_shared_spare(spa_t *spa)
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
-#if 0
#ifdef _KERNEL
sysevent_t *ev;
sysevent_attr_list_t *attr = NULL;
@@ -4526,5 +5927,4 @@ done:
sysevent_free_attr(attr);
sysevent_free(ev);
#endif
-#endif
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
index 34050ef9150a..0b8255ef3558 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -36,6 +35,7 @@
#include <sys/sunddi.h>
#ifdef _KERNEL
#include <sys/kobj.h>
+#include <sys/zone.h>
#endif
/*
@@ -74,7 +74,6 @@ spa_config_load(void)
void *buf = NULL;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
- spa_t *spa;
char *pathname;
struct _buf *file;
uint64_t fsize;
@@ -88,25 +87,21 @@ spa_config_load(void)
file = kobj_open_file(pathname);
- if (file == (struct _buf *)-1) {
- ZFS_LOG(1, "Cannot open %s.", pathname);
- goto out;
- }
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (file == (struct _buf *)-1)
+ return;
- if (kobj_get_filesize(file, &fsize) != 0) {
- ZFS_LOG(1, "Cannot get size of %s.", pathname);
+ if (kobj_get_filesize(file, &fsize) != 0)
goto out;
- }
buf = kmem_alloc(fsize, KM_SLEEP);
/*
* Read the nvlist from the file.
*/
- if (kobj_read_file(file, buf, fsize, 0) < 0) {
- ZFS_LOG(1, "Cannot read %s.", pathname);
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
goto out;
- }
/*
* Unpack the nvlist.
@@ -114,8 +109,6 @@ spa_config_load(void)
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
goto out;
- ZFS_LOG(1, "File %s loaded.", pathname);
-
/*
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
@@ -123,7 +116,6 @@ spa_config_load(void)
mutex_enter(&spa_namespace_lock);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
continue;
@@ -131,33 +123,27 @@ spa_config_load(void)
if (spa_lookup(nvpair_name(nvpair)) != NULL)
continue;
- spa = spa_add(nvpair_name(nvpair), NULL);
-
- /*
- * We blindly duplicate the configuration here. If it's
- * invalid, we will catch it when the pool is first opened.
- */
- VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ (void) spa_add(nvpair_name(nvpair), child, NULL);
}
mutex_exit(&spa_namespace_lock);
nvlist_free(nvlist);
out:
- kmem_free(pathname, MAXPATHLEN);
if (buf != NULL)
kmem_free(buf, fsize);
- if (file != (struct _buf *)-1)
- kobj_close_file(file);
+
+ kobj_close_file(file);
}
static void
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
- int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
- char *buf, *temp;
size_t buflen;
+ char *buf;
vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char *temp;
/*
* If the nvlist is empty (NULL), then remove the old cachefile.
@@ -328,6 +314,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vdev_t *rvd = spa->spa_root_vdev;
unsigned long hostid = 0;
boolean_t locked = B_FALSE;
+ uint64_t split_guid;
if (vd == NULL) {
vd = rvd;
@@ -356,7 +343,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
txg) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
spa_guid(spa)) == 0);
+#ifdef _KERNEL
+ hostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so we can't use
+ * zone_get_hostid().
+ */
(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
+#endif /* _KERNEL */
if (hostid != 0) {
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
hostid) == 0);
@@ -376,12 +371,63 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
1ULL) == 0);
vd = vd->vdev_top; /* label contains top config */
+ } else {
+ /*
+ * Only add the (potentially large) split information
+ * in the mos config, and not in the vdev labels
+ */
+ if (spa->spa_config_splitting != NULL)
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ spa->spa_config_splitting) == 0);
}
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace.
+ */
+ vdev_top_config_generate(spa, config);
+
+ /*
+ * If we're splitting, record the original pool's guid.
+ */
+ if (spa->spa_config_splitting != NULL &&
+ nvlist_lookup_uint64(spa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+ split_guid) == 0);
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, 0);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
+ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+
+ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
+ kmem_free(ddh, sizeof (ddt_histogram_t));
+
+ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+ ddt_get_dedup_object_stats(spa, ddo);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
+ kmem_free(ddo, sizeof (ddt_object_t));
+
+ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+ ddt_get_dedup_stats(spa, dds);
+ VERIFY(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
+ kmem_free(dds, sizeof (ddt_stat_t));
+ }
+
if (locked)
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
@@ -416,10 +462,9 @@ spa_config_update(spa_t *spa, int what)
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ms_array == 0) {
- vdev_init(tvd, txg);
- vdev_config_dirty(tvd);
- }
+ if (tvd->vdev_ms_array == 0)
+ vdev_metaslab_set_size(tvd);
+ vdev_expand(tvd, txg);
}
}
spa_config_exit(spa, SCL_ALL, FTAG);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
index e1ae4917137a..282140b3bd65 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -54,38 +53,6 @@
#include <sys/zap.h>
#include <sys/zio.h>
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-#ifdef _KERNEL
-uint64_t
-_strtonum(const char *str, char **nptr)
-{
- uint64_t val = 0;
- char c;
- int digit;
-
- while ((c = *str) != '\0') {
- if (c >= '0' && c <= '9')
- digit = c - '0';
- else if (c >= 'a' && c <= 'f')
- digit = 10 + c - 'a';
- else
- break;
-
- val *= 16;
- val += digit;
-
- str++;
- }
-
- if (nptr)
- *nptr = (char *)str;
-
- return (val);
-}
-#endif
/*
* Convert a bookmark to a string.
@@ -105,13 +72,13 @@ bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
static void
name_to_bookmark(char *buf, zbookmark_t *zb)
{
- zb->zb_objset = _strtonum(buf, &buf);
+ zb->zb_objset = strtonum(buf, &buf);
ASSERT(*buf == ':');
- zb->zb_object = _strtonum(buf + 1, &buf);
+ zb->zb_object = strtonum(buf + 1, &buf);
ASSERT(*buf == ':');
- zb->zb_level = (int)_strtonum(buf + 1, &buf);
+ zb->zb_level = (int)strtonum(buf + 1, &buf);
ASSERT(*buf == ':');
- zb->zb_blkid = _strtonum(buf + 1, &buf);
+ zb->zb_blkid = strtonum(buf + 1, &buf);
ASSERT(*buf == '\0');
}
#endif
@@ -134,7 +101,7 @@ spa_log_error(spa_t *spa, zio_t *zio)
* If we are trying to import a pool, ignore any errors, as we won't be
* writing to the pool any time soon.
*/
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
return;
mutex_enter(&spa->spa_errlist_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
index b403ccbcc444..942636b906ce 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/spa.h>
@@ -32,6 +31,7 @@
#include <sys/dmu_objset.h>
#include <sys/utsname.h>
#include <sys/sunddi.h>
+#include "zfs_comutil.h"
#ifdef _KERNEL
#include <sys/cmn_err.h>
#include <sys/zone.h>
@@ -103,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
* Figure out maximum size of history log. We set it at
* 1% of pool size, with a max of 32MB and min of 128KB.
*/
- shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
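As a rough worked example of the clamp above: 1% of a 1 TB pool is about 10 GB, so the 32 MB cap applies; 1% of a 5 MB pool is about 50 KB, so the 128 KB floor applies. Only pools between roughly 12.5 MB and 3.2 GB end up with an uncapped 1% history object.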
@@ -187,8 +188,9 @@ spa_history_zone()
/*
* Write out a history event.
*/
+/*ARGSUSED*/
static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
history_arg_t *hap = arg2;
@@ -231,9 +233,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
gethrestime_sec()) == 0);
- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO,
- (uint64_t)crgetuid(cr)) == 0);
- if (hap->ha_zone[0] != '\0')
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
+ if (hap->ha_zone != NULL)
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
hap->ha_zone) == 0);
#ifdef _KERNEL
@@ -244,6 +245,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
hap->ha_log_type == LOG_CMD_NORMAL) {
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
history_str) == 0);
+
+ zfs_dbgmsg("command: %s", history_str);
} else {
VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
hap->ha_event) == 0);
@@ -251,6 +254,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
tx->tx_txg) == 0);
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
history_str) == 0);
+
+ zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
+ zfs_history_event_names[hap->ha_event], spa_name(spa),
+ (longlong_t)tx->tx_txg, history_str);
+
}
VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
@@ -279,10 +287,10 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
kmem_free(record_packed, reclen);
dmu_buf_rele(dbp, FTAG);
- if (hap->ha_log_type == LOG_INTERNAL) {
- kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN);
- kmem_free(hap, sizeof (history_arg_t));
- }
+ strfree(hap->ha_history_str);
+ if (hap->ha_zone != NULL)
+ strfree(hap->ha_zone);
+ kmem_free(hap, sizeof (history_arg_t));
}
/*
@@ -291,15 +299,32 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
int
spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
{
- history_arg_t ha;
+ history_arg_t *ha;
+ int err = 0;
+ dmu_tx_t *tx;
ASSERT(what != LOG_INTERNAL);
- ha.ha_history_str = history_str;
- ha.ha_log_type = what;
- (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone));
- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
- spa, &ha, 0));
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+ ha->ha_history_str = strdup(history_str);
+ ha->ha_zone = strdup(spa_history_zone());
+ ha->ha_log_type = what;
+ ha->ha_uid = crgetuid(CRED());
+
+ /* Kick this off asynchronously; errors are ignored. */
+ dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
+ spa_history_log_sync, spa, ha, 0, tx);
+ dmu_tx_commit(tx);
+
+ /* spa_history_log_sync will free ha and strings */
+ return (err);
}
/*
@@ -322,6 +347,14 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
if (!spa->spa_history)
return (ENOENT);
+ /*
+ * The history is logged asynchronously, so when the caller requests
+ * the first chunk of history, make sure everything has been
+ * synced to disk so that the record is available.
+ */
+ if (*offp == 0 && spa_writeable(spa))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
return (err);
shpp = dbp->db_data;
@@ -391,13 +424,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
return (err);
}
-void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+static void
+log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, va_list adx)
{
- history_arg_t *hap;
- char *str;
- va_list adx;
+ history_arg_t *ha;
+ va_list adx2;
/*
* If this is part of creating a pool, not everything is
@@ -406,23 +438,71 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
if (tx->tx_txg == TXG_INITIAL)
return;
- hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
- str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+ va_copy(adx2, adx);
- va_start(adx, fmt);
- (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
- va_end(adx);
+ ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+ ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx2) + 1,
+ KM_SLEEP);
+
+ va_end(adx2);
+
+ (void) vsprintf(ha->ha_history_str, fmt, adx);
- hap->ha_log_type = LOG_INTERNAL;
- hap->ha_history_str = str;
- hap->ha_event = event;
- hap->ha_zone[0] = '\0';
+ ha->ha_log_type = LOG_INTERNAL;
+ ha->ha_event = event;
+ ha->ha_zone = NULL;
+ ha->ha_uid = 0;
if (dmu_tx_is_syncing(tx)) {
- spa_history_log_sync(spa, hap, cr, tx);
+ spa_history_log_sync(spa, ha, tx);
} else {
dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
- spa_history_log_sync, spa, hap, 0, tx);
+ spa_history_log_sync, spa, ha, 0, tx);
}
- /* spa_history_log_sync() will free hap and str */
+ /* spa_history_log_sync() will free ha and strings */
+}
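The buffer sizing above uses the standard two-pass vsnprintf() idiom: calling vsnprintf() with a NULL buffer and zero size returns the length the formatted string would occupy, and one extra byte covers the terminating NUL; the va_copy() is needed because a va_list cannot be reused once it has been consumed. A minimal userland sketch of the same idiom (the function name and the use of malloc() are illustrative only, not part of this patch):

	#include <stdarg.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Format into an exactly-sized buffer; returns NULL on allocation failure. */
	static char *
	format_alloc(const char *fmt, ...)
	{
		va_list ap, ap2;
		char *buf;
		int len;

		va_start(ap, fmt);
		va_copy(ap2, ap);
		len = vsnprintf(NULL, 0, fmt, ap2) + 1;	/* formatted length + NUL */
		va_end(ap2);

		buf = malloc(len);
		if (buf != NULL)
			(void) vsnprintf(buf, len, fmt, ap);	/* fits by construction */
		va_end(ap);
		return (buf);
	}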
+
+void
+spa_history_log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(event, spa, htx, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_version(spa_t *spa, history_internal_events_t event)
+{
+#ifdef _KERNEL
+ uint64_t current_vers = spa_version(spa);
+
+ if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
+ spa_history_log_internal(event, spa, NULL,
+ "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
+ (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
+ utsname.nodename, utsname.release, utsname.version,
+ utsname.machine);
+ }
+ cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
+ event == LOG_POOL_IMPORT ? "imported" :
+ event == LOG_POOL_CREATE ? "created" : "accessed",
+ (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
+#endif
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 89e0301873cf..1709f6884c9d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,10 +40,11 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
-#include <sys/sunddi.h>
#include <sys/arc.h>
+#include <sys/ddt.h>
#include "zfs_prop.h"
/*
@@ -186,7 +186,7 @@
*
* SCL_VDEV
* Held as reader to prevent changes to the vdev tree during trivial
- * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
* other locks, and lower than all of them, to ensure that it's safe
* to acquire regardless of caller context.
*
@@ -314,8 +314,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
+ int wlocks_held = 0;
+
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
@@ -335,6 +339,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
(void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
+ ASSERT(wlocks_held <= locks);
}
void
@@ -419,7 +424,7 @@ spa_lookup(const char *name)
* exist by calling spa_lookup() first.
*/
spa_t *
-spa_add(const char *name, const char *altroot)
+spa_add(const char *name, nvlist_t *config, const char *altroot)
{
spa_t *spa;
spa_config_dirent_t *dp;
@@ -429,29 +434,36 @@ spa_add(const char *name, const char *altroot)
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
spa->spa_freeze_txg = UINT64_MAX;
spa->spa_final_txg = UINT64_MAX;
+ spa->spa_load_max_txg = UINT64_MAX;
+ spa->spa_proc = &p0;
+ spa->spa_proc_state = SPA_PROC_NONE;
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
avl_add(&spa_namespace_avl, spa);
- mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
-
/*
* Set the alternate root, if there is one.
*/
@@ -467,9 +479,15 @@ spa_add(const char *name, const char *altroot)
offsetof(spa_config_dirent_t, scd_link));
dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
- dp->scd_path = spa_strdup(spa_config_path);
+ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
list_insert_head(&spa->spa_config_list, dp);
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
+ if (config != NULL)
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
return (spa);
}
@@ -486,6 +504,8 @@ spa_remove(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ nvlist_free(spa->spa_config_splitting);
+
avl_remove(&spa_namespace_avl, spa);
cv_broadcast(&spa_namespace_cv);
@@ -503,24 +523,30 @@ spa_remove(spa_t *spa)
list_destroy(&spa->spa_config_list);
+ nvlist_free(spa->spa_load_info);
spa_config_set(spa, NULL);
refcount_destroy(&spa->spa_refcount);
spa_config_lock_destroy(spa);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_proc_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_scrub_lock);
- mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_proc_lock);
mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
+ mutex_destroy(&spa->spa_vdev_top_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -814,12 +840,6 @@ spa_l2cache_activate(vdev_t *vd)
mutex_exit(&spa_l2cache_lock);
}
-void
-spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
-{
- vdev_space_update(vd, space, alloc, B_FALSE);
-}
-
/*
* ==========================================================================
* SPA vdev locking
@@ -834,7 +854,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
uint64_t
spa_vdev_enter(spa_t *spa)
{
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
@@ -842,14 +875,14 @@ spa_vdev_enter(spa_t *spa)
}
/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
*/
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
@@ -861,17 +894,28 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
- /*
- * If the config changed, notify the scrub thread that it must restart.
- */
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
- dsl_pool_scrub_restart(spa->spa_dsl_pool);
config_changed = B_TRUE;
+ spa->spa_config_generation++;
}
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+
spa_config_exit(spa, SCL_ALL, spa);
/*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag, 0);
+
+ /*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
@@ -891,8 +935,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
if (config_changed)
spa_config_sync(spa, B_FALSE, B_TRUE);
+}
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
@@ -901,18 +957,52 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
* Lock the given spa_t for the purpose of changing vdev state.
*/
void
-spa_vdev_state_enter(spa_t *spa)
+spa_vdev_state_enter(spa_t *spa, int oplocks)
{
- spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+ int locks = SCL_STATE_ALL | oplocks;
+
+ /*
+ * Root pools may need to read from the underlying devfs filesystem
+ * when opening up a vdev. Unfortunately if we're holding the
+ * SCL_ZIO lock it will result in a deadlock when we try to issue
+ * the read from the root filesystem. Instead we "prefetch"
+ * the associated vnodes that we need prior to opening the
+ * underlying devices and cache them so that we can prevent
+ * any I/O when we are doing the actual open.
+ */
+ if (spa_is_root(spa)) {
+ int low = locks & ~(SCL_ZIO - 1);
+ int high = locks & ~low;
+
+ spa_config_enter(spa, high, spa, RW_WRITER);
+ vdev_hold(spa->spa_root_vdev);
+ spa_config_enter(spa, low, spa, RW_WRITER);
+ } else {
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ }
+ spa->spa_vdev_locks = locks;
}
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
- if (vd != NULL)
+ boolean_t config_changed = B_FALSE;
+
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+ 0, 0, B_FALSE);
+
+ if (vd != NULL) {
vdev_state_dirty(vd->vdev_top);
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ if (spa_is_root(spa))
+ vdev_rele(spa->spa_root_vdev);
- spa_config_exit(spa, SCL_STATE_ALL, spa);
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
/*
* If anything changed, wait for it to sync. This ensures that,
@@ -923,6 +1013,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
if (vd != NULL)
txg_wait_synced(spa->spa_dsl_pool, 0);
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+ mutex_exit(&spa_namespace_lock);
+ }
+
return (error);
}
@@ -982,14 +1081,13 @@ spa_rename(const char *name, const char *newname)
return (0);
}
-
/*
- * Determine whether a pool with given pool_guid exists. If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
+ * Return the spa_t associated with given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
*/
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
@@ -1020,7 +1118,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
}
}
- return (spa != NULL);
+ return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
}
char *
@@ -1055,48 +1162,36 @@ spa_get_random(uint64_t range)
return (r % range);
}
-void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+uint64_t
+spa_generate_guid(spa_t *spa)
{
- int d;
+ uint64_t guid = spa_get_random(-1ULL);
- if (bp == NULL) {
- (void) snprintf(buf, len, "<NULL>");
- return;
+ if (spa != NULL) {
+ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ } else {
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
}
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(buf, len, "<hole>");
- return;
- }
+ return (guid);
+}
+
+void
+sprintf_blkptr(char *buf, const blkptr_t *bp)
+{
+ char *type = NULL;
+ char *checksum = NULL;
+ char *compress = NULL;
- (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
- (u_longlong_t)BP_GET_LEVEL(bp),
- dmu_ot[BP_GET_TYPE(bp)].ot_name,
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp));
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "DVA[%d]=<%llu:%llx:%llx> ", d,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)DVA_GET_ASIZE(dva));
+ if (bp != NULL) {
+ type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+ checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- BP_IS_GANG(bp) ? "gang" : "contiguous",
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
}
void
@@ -1233,59 +1328,55 @@ spa_first_txg(spa_t *spa)
return (spa->spa_first_txg);
}
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
pool_state_t
spa_state(spa_t *spa)
{
return (spa->spa_state);
}
-uint64_t
-spa_freeze_txg(spa_t *spa)
+spa_load_state_t
+spa_load_state(spa_t *spa)
{
- return (spa->spa_freeze_txg);
+ return (spa->spa_load_state);
}
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
uint64_t
-spa_get_alloc(spa_t *spa)
+spa_freeze_txg(spa_t *spa)
{
- return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+ return (spa->spa_freeze_txg);
}
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
+/* ARGSUSED */
uint64_t
-spa_get_space(spa_t *spa)
+spa_get_asize(spa_t *spa, uint64_t lsize)
{
- return (spa->spa_root_vdev->vdev_stat.vs_space);
+ /*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().
+ */
+ return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
}
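For concreteness: taking VDEV_RAIDZ_MAXPARITY as 3 and SPA_DVAS_PER_BP as 3 (the values this code is written against, stated here as an assumption rather than read from this patch), the multiplier works out to (3 + 1) * 3 * 2 = 24, so a 4 KB logical write may be charged up to 96 KB of worst-case allocation for transaction accounting.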
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
uint64_t
spa_get_dspace(spa_t *spa)
{
- if (spa->spa_deflate)
- return (spa->spa_root_vdev->vdev_stat.vs_dspace);
- else
- return (spa->spa_root_vdev->vdev_stat.vs_space);
+ return (spa->spa_dspace);
}
-/* ARGSUSED */
-uint64_t
-spa_get_asize(spa_t *spa, uint64_t lsize)
+void
+spa_update_dspace(spa_t *spa)
{
- /*
- * For now, the worst case is 512-byte RAID-Z blocks, in which
- * case the space requirement is exactly 2x; so just assume that.
- * Add to this the fact that we can have up to 3 DVAs per bp, and
- * we have to multiply by a total of 6x.
- */
- return (lsize * 6);
+ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+ ddt_get_dedup_dspace(spa);
}
/*
@@ -1310,6 +1401,24 @@ spa_version(spa_t *spa)
return (spa->spa_ubsync.ub_version);
}
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
int
spa_max_replication(spa_t *spa)
{
@@ -1323,24 +1432,52 @@ spa_max_replication(spa_t *spa)
return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
- int sz = 0, i;
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- if (vd)
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
- SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ if (asize != 0 && spa->spa_deflate) {
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
spa_config_exit(spa, SCL_VDEV, FTAG);
- return (sz);
+
+ return (dsize);
}
/*
@@ -1442,9 +1579,18 @@ spa_has_slogs(spa_t *spa)
return (spa->spa_log_class->mc_rotor != NULL);
}
-/*
- * Return whether this pool is the root pool.
- */
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
boolean_t
spa_is_root(spa_t *spa)
{
@@ -1462,3 +1608,69 @@ spa_mode(spa_t *spa)
{
return (spa->spa_mode);
}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ spa->spa_scan_pass_exam = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (ENOENT);
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+ ps->pss_state = scn->scn_phys.scn_state;
+
+ /* data not stored on disk */
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index d0251419cbc4..1ce7b2a3d466 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -258,8 +258,10 @@ space_map_load_wait(space_map_t *sm)
{
ASSERT(MUTEX_HELD(sm->sm_lock));
- while (sm->sm_loading)
+ while (sm->sm_loading) {
+ ASSERT(!sm->sm_loaded);
cv_wait(&sm->sm_load_cv, sm->sm_lock);
+ }
}
/*
@@ -276,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
int error = 0;
ASSERT(MUTEX_HELD(sm->sm_lock));
-
- space_map_load_wait(sm);
-
- if (sm->sm_loaded)
- return (0);
+ ASSERT(!sm->sm_loaded);
+ ASSERT(!sm->sm_loading);
sm->sm_loading = B_TRUE;
end = smo->smo_objsize;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index f52851d69f46..8f189c62d31d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ARC_H
@@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
- krwlock_t b_lock;
+ kmutex_t b_evict_lock;
+ krwlock_t b_data_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
@@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb);
int arc_released(arc_buf_t *buf);
int arc_has_callback(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
@@ -99,27 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf);
int arc_referenced(arc_buf_t *buf);
#endif
-typedef struct writeprops {
- dmu_object_type_t wp_type;
- uint8_t wp_level;
- uint8_t wp_copies;
- uint8_t wp_dncompress, wp_oscompress;
- uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
int arc_buf_evict(arc_buf_t *buf);
@@ -135,7 +127,7 @@ void arc_fini(void);
* Level 2 ARC
*/
-void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end);
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
void l2arc_init(void);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
index cdb93a6c35a3..471be9047ec2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -19,68 +19,36 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_BPLIST_H
#define _SYS_BPLIST_H
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
#include <sys/zfs_context.h>
+#include <sys/spa.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct bplist_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
- * contents is an array of bpl_entries blkptr_t's, representing
- * a total of bpl_bytes physical space.
- */
- uint64_t bpl_entries;
- uint64_t bpl_bytes;
- uint64_t bpl_comp;
- uint64_t bpl_uncomp;
-} bplist_phys_t;
-
-#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
- blkptr_t bpq_blk;
- void *bpq_next;
-} bplist_q_t;
+typedef struct bplist_entry {
+ blkptr_t bpe_blk;
+ list_node_t bpe_node;
+} bplist_entry_t;
typedef struct bplist {
kmutex_t bpl_lock;
- objset_t *bpl_mos;
- uint64_t bpl_object;
- uint8_t bpl_blockshift;
- uint8_t bpl_bpshift;
- uint8_t bpl_havecomp;
- bplist_q_t *bpl_queue;
- bplist_phys_t *bpl_phys;
- dmu_buf_t *bpl_dbuf;
- dmu_buf_t *bpl_cached_dbuf;
+ list_t bpl_list;
} bplist_t;
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
- uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+ void *arg, dmu_tx_t *tx);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
new file mode 100644
index 000000000000..3771a9541aa7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+ * a total of bpo_bytes of physical space.
+ */
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+ kmutex_t bpo_lock;
+ objset_t *bpo_os;
+ uint64_t bpo_object;
+ int bpo_epb;
+ uint8_t bpo_havecomp;
+ uint8_t bpo_havesubobj;
+ bpobj_phys_t *bpo_phys;
+ dmu_buf_t *bpo_dbuf;
+ dmu_buf_t *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 7e2754d000b4..cf1bbc030f45 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DBUF_H
#define _SYS_DBUF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -35,12 +32,12 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
#endif
-#define DB_BONUS_BLKID (-1ULL)
#define IN_DMU_SYNC 2
/*
@@ -55,25 +52,28 @@ extern "C" {
#define DB_RF_CACHED (1 << 5)
/*
- * The state transition diagram for dbufs looks like:
+ * The simplified state transition diagram for dbufs looks like:
*
* +----> READ ----+
* | |
* | V
* (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^
- * | |
- * +----> FILL ----+
+ * | ^ ^
+ * | | |
+ * +----> FILL ----+ |
+ * | |
+ * | |
+ * +--------> NOFILL -------+
*/
typedef enum dbuf_states {
DB_UNCACHED,
DB_FILL,
+ DB_NOFILL,
DB_READ,
DB_CACHED,
DB_EVICTING
} dbuf_states_t;
-struct objset_impl;
struct dnode;
struct dmu_tx;
@@ -83,9 +83,6 @@ struct dmu_tx;
* etc.
*/
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
struct dmu_buf_impl;
typedef enum override_states {
@@ -132,6 +129,7 @@ typedef struct dbuf_dirty_record {
arc_buf_t *dr_data;
blkptr_t dr_overridden_by;
override_states_t dr_override_state;
+ uint8_t dr_copies;
} dl;
} dt;
} dbuf_dirty_record_t;
@@ -146,18 +144,20 @@ typedef struct dmu_buf_impl {
dmu_buf_t db;
/* the objset we belong to */
- struct objset_impl *db_objset;
+ struct objset *db_objset;
/*
- * the dnode we belong to (NULL when evicted)
+ * handle to safely access the dnode we belong to (NULL when evicted)
*/
- struct dnode *db_dnode;
+ struct dnode_handle *db_dnode_handle;
/*
* our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
* only accessed by sync thread ???
* (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
*/
struct dmu_buf_impl *db_parent;
@@ -240,6 +240,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
@@ -253,17 +257,19 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
@@ -271,30 +277,53 @@ void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define DB_GET_SPA(_spa_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_spa_p) = __dn->dn_objset->os_spa; \
+ DB_DNODE_EXIT(_db); \
+}
+#define DB_GET_OBJSET(_os_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_os_p) = __dn->dn_objset; \
+ DB_DNODE_EXIT(_db); \
+}
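A hedged usage sketch of the new accessor macros (variable names are illustrative): every dereference of the owning dnode is bracketed by DB_DNODE_ENTER/DB_DNODE_EXIT so the handle's zrlock pins the dnode against dnode_move():

		dnode_t *dn;
		uint8_t blkshift;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		blkshift = dn->dn_datablkshift;	/* safe: the dnode cannot move here */
		DB_DNODE_EXIT(db);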
+
void dbuf_init(void);
void dbuf_fini(void);
-#define DBUF_IS_METADATA(db) \
- ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_IS_METADATA(_db) \
+ (dbuf_is_metadata(_db))
-#define DBUF_GET_BUFC_TYPE(db) \
- (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-#define DBUF_IS_CACHEABLE(db) \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-#define DBUF_IS_L2CACHEABLE(db) \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
@@ -322,10 +351,10 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ sprintf_blkptr(__blkbuf, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
+ } \
_NOTE(CONSTCOND) } while (0)
#define DBUF_VERIFY(db) dbuf_verify(db)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
new file mode 100644
index 000000000000..9724d6ecebb0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+ DDT_TYPE_ZAP = 0,
+ DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+ DDT_CLASS_DITTO = 0,
+ DDT_CLASS_DUPLICATE,
+ DDT_CLASS_UNIQUE,
+ DDT_CLASSES
+};
+
+#define DDT_TYPE_CURRENT 0
+
+#define DDT_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_COMPRESS_FUNCTION_MASK 0x7f
+
+/*
+ * On-disk ddt entry: key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+ zio_cksum_t ddk_cksum; /* 256-bit block checksum */
+ uint64_t ddk_prop; /* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define DDK_GET_LSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_LSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_PSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_PSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
+#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
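A worked example of the encoding (illustrative only): with SPA_MINBLOCKSHIFT == 9 and a bias of 1, the macros store (size >> 9) - 1, so a 128K logical / 4K physical block compressed with lzjb would be packed as:

		ddt_key_t ddk;

		bzero(&ddk, sizeof (ddk));
		DDK_SET_LSIZE(&ddk, 128 << 10);		/* bits  0..15 hold 255 */
		DDK_SET_PSIZE(&ddk, 4 << 10);		/* bits 16..31 hold 7 */
		DDK_SET_COMPRESS(&ddk, ZIO_COMPRESS_LZJB);	/* bits 32..39 */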
+
+typedef struct ddt_phys {
+ dva_t ddp_dva[SPA_DVAS_PER_BP];
+ uint64_t ddp_refcnt;
+ uint64_t ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+ DDT_PHYS_DITTO = 0,
+ DDT_PHYS_SINGLE = 1,
+ DDT_PHYS_DOUBLE = 2,
+ DDT_PHYS_TRIPLE = 3,
+ DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+ ddt_key_t dde_key;
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+ void *dde_repair_data;
+ enum ddt_type dde_type;
+ enum ddt_class dde_class;
+ uint8_t dde_loading;
+ uint8_t dde_loaded;
+ kcondvar_t dde_cv;
+ avl_node_t dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+ kmutex_t ddt_lock;
+ avl_tree_t ddt_tree;
+ avl_tree_t ddt_repair_tree;
+ enum zio_checksum ddt_checksum;
+ spa_t *ddt_spa;
+ objset_t *ddt_os;
+ uint64_t ddt_stat_object;
+ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+ avl_node_t ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+ uint64_t ddb_class;
+ uint64_t ddb_type;
+ uint64_t ddb_checksum;
+ uint64_t ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+ char ddt_op_name[32];
+ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+ boolean_t prehash);
+ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+ void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+ ddt_entry_t *dde);
+ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ uint64_t *walk);
+ uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define DDT_NAMELEN 80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+ uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+ const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+ uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+ const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 08c30c8ed015..4f91a91a5d61 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -19,15 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_DMU_H
#define _SYS_DMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file describes the interface that the DMU provides for its
* consumers.
@@ -39,12 +38,14 @@
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
+#include <sys/time.h>
#ifdef __cplusplus
extern "C" {
#endif
struct uio;
+struct xuio;
struct page;
struct vnode;
struct spa;
@@ -60,8 +61,9 @@ struct drr_end;
struct zbookmark;
struct spa;
struct nvlist;
-struct objset_impl;
struct arc_buf;
+struct zio_prop;
+struct sa_handle;
struct file;
typedef struct objset objset_t;
@@ -75,8 +77,8 @@ typedef enum dmu_object_type {
DMU_OT_OBJECT_ARRAY, /* UINT64 */
DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPLIST, /* UINT64 */
- DMU_OT_BPLIST_HDR, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
/* spa: */
DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
DMU_OT_SPACE_MAP, /* UINT64 */
@@ -116,9 +118,22 @@ typedef enum dmu_object_type {
DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */
- DMU_OT_SCRUB_QUEUE, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -141,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size);
void zfs_acl_byteswap(void *buf, size_t size);
void zfs_znode_byteswap(void *buf, size_t size);
-#define DS_MODE_NOHOLD 0 /* internal use only */
-#define DS_MODE_USER 1 /* simple access, no special needs */
-#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */
-#define DS_MODE_TYPE_MASK 0x3
-#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK)
-#define DS_MODE_READONLY 0x8
-#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
-#define DS_MODE_INCONSISTENT 0x10
-#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
-
#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
@@ -163,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
+#define DMU_DEADLIST_OBJECT (-3ULL)
/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
* Public routines to create, destroy, open, and close objsets.
*/
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
int dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
-int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
- boolean_t recursive);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
int dmu_objset_rename(const char *name, const char *newname,
boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
@@ -202,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_DIRECTORY_OBJECT 1
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
#define DMU_POOL_ERRLOG_LAST "errlog_last"
#define DMU_POOL_SPARES "spares"
@@ -210,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
#define DMU_POOL_L2CACHE "l2cache"
-
-/* 4x8 zbookmark_t */
-#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define DMU_POOL_SCRUB_QUEUE "scrub_queue"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define DMU_POOL_SCRUB_FUNC "scrub_func"
-/* 1x8 count */
-#define DMU_POOL_SCRUB_ERRORS "scrub_errors"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_DDT "DDT-%s-%s-%s"
+#define DMU_POOL_DDT_STATS "DDT-statistics"
+#define DMU_POOL_CREATION_VERSION "creation_version"
+#define DMU_POOL_SCAN "scan"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
/*
* Allocate an object from this objset. The range of object numbers
@@ -307,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
/*
- * Decide how many copies of a given block we should make. Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
*/
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
- dmu_object_type_t ot);
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+ struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -325,6 +334,18 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
/*
* Obtain the DMU buffer from the specified object which contains the
@@ -341,7 +362,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **);
+ void *tag, dmu_buf_t **, int flags);
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
@@ -438,12 +459,35 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
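A sketch of the registration pattern described above (my_done_cb, my_state_t, st and the object/offset/size/buf arguments are hypothetical; dmu_tx_create() and dmu_tx_hold_write() are assumed from the existing DMU API):

	/* Called once the txg reaches stable storage, or with error != 0 on abort. */
	static void
	my_done_cb(void *dcb_data, int error)
	{
		kmem_free(dcb_data, sizeof (my_state_t));
	}

		dmu_tx_t *tx;
		int error;

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, object, offset, size);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			return (error);
		}
		dmu_tx_callback_register(tx, my_done_cb, st);
		dmu_write(os, object, offset, size, buf, tx);
		dmu_tx_commit(tx);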
+
+/*
* Free up the data blocks for a defined range of a file. If size is
* zero, the range from offset to end-of-file is freed.
*/
@@ -465,15 +509,28 @@ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
@@ -484,19 +541,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
uint64_t len);
typedef struct dmu_object_info {
- /* All sizes are in bytes. */
+ /* All sizes are in bytes unless otherwise indicated. */
uint32_t doi_data_block_size;
uint32_t doi_metadata_block_size;
- uint64_t doi_bonus_size;
dmu_object_type_t doi_type;
dmu_object_type_t doi_bonus_type;
+ uint64_t doi_bonus_size;
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_pad[5];
- /* Values below are number of 512-byte blocks. */
- uint64_t doi_physical_blks; /* data + metadata */
- uint64_t doi_max_block_offset;
+ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
+ uint64_t doi_max_offset;
+ uint64_t doi_fill_count; /* number of non-empty blocks */
} dmu_object_info_t;
typedef void arc_byteswap_func_t(void *buf, size_t size);
@@ -565,6 +622,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
*/
uint64_t dmu_objset_fsid_guid(objset_t *os);
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
int dmu_objset_is_snapshot(objset_t *os);
extern struct spa *dmu_objset_spa(objset_t *os);
@@ -574,6 +636,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_syncprop(objset_t *os);
+extern uint64_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -581,9 +645,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp);
-typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
- void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
- dmu_tx_t *tx);
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+ void *bonus, uint64_t *userp, uint64_t *groupp);
extern void dmu_objset_register_type(dmu_objset_type_t ost,
objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
@@ -604,9 +667,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
* storage when the write completes this new data does not become a
* permanent part of the file until the associated transaction commits.
*/
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
- struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+ struct zilog *zgd_zilog;
+ struct blkptr *zgd_bp;
+ dmu_buf_t *zgd_db;
+ struct rl *zgd_rl;
+ void *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
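For illustration only (the exact cleanup a consumer performs lives in the ZPL/zvol code and is not shown in this diff): a dmu_sync_cb_t typically releases whatever the zgd_t pinned once the outcome of the write is known:

	static void
	my_get_done(zgd_t *zgd, int error)
	{
		if (zgd->zgd_db != NULL)
			dmu_buf_rele(zgd->zgd_db, zgd);
		/* a real consumer would also drop zgd_rl (the range lock) here */
		kmem_free(zgd, sizeof (zgd_t));
	}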
/*
* Find the next hole or data block in file starting at *off
@@ -641,15 +715,19 @@ typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_real_ds;
struct drr_begin *drc_drrb;
char *drc_tosnap;
+ char *drc_top_ds;
boolean_t drc_newfs;
boolean_t drc_force;
} dmu_recv_cookie_t;
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp);
+int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep);
int dmu_recv_end(dmu_recv_cookie_t *drc);
-void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
+
+int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct file *fp,
+ offset_t *off);
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index 96ce688e1551..2cb7f121cc0d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,6 +29,7 @@
#include <sys/txg_impl.h>
#include <sys/zio.h>
#include <sys/dnode.h>
+#include <sys/kstat.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -210,11 +211,11 @@ extern "C" {
*
* ds_lock
* protects:
- * ds_user_ptr
- * ds_user_evice_func
+ * ds_objset
* ds_open_refcount
* ds_snapname
* ds_phys accounting
+ * ds_phys userrefs zapobj
* ds_reserved
* held from:
* dsl_dataset_*
@@ -232,6 +233,39 @@ extern "C" {
struct objset;
struct dmu_pool;
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
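Presumably (not shown in this diff) the xuio_stat_wbuf_copied()/xuio_stat_wbuf_nocopy() helpers exported through dmu.h reduce to these macros, e.g.:

	void
	xuio_stat_wbuf_copied()
	{
		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}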
+
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index a8022d2eaa8f..d687642b3f14 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_DMU_OBJSET_H
#define _SYS_DMU_OBJSET_H
@@ -33,18 +34,23 @@
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
#endif
+extern krwlock_t os_lock;
+
struct dsl_dataset;
struct dmu_tx;
-struct objset_impl;
#define OBJSET_PHYS_SIZE 2048
#define OBJSET_OLD_PHYS_SIZE 1024
+#define OBJSET_BUF_HAS_USERUSED(buf) \
+ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
typedef struct objset_phys {
@@ -59,26 +65,32 @@ typedef struct objset_phys {
} objset_phys_t;
struct objset {
- struct objset_impl *os;
- int os_mode;
-};
-
-typedef struct objset_impl {
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
spa_t *os_spa;
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- dnode_t *os_userused_dnode;
- dnode_t *os_groupused_dnode;
+ /*
+ * The following "special" dnodes have no parent and are exempt from
+ * dnode_move(), but they root their descendents in this objset using
+ * handles anyway, so that all access to dnodes from dbufs consistently
+ * uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
- objset_t os;
- uint8_t os_checksum; /* can change, under dsl_dir's locks */
- uint8_t os_compress; /* can change, under dsl_dir's locks */
- uint8_t os_copies; /* can change, under dsl_dir's locks */
- uint8_t os_primary_cache; /* can change, under dsl_dir's locks */
- uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */
+
+ /* can change, under dsl_dir's locks: */
+ uint8_t os_checksum;
+ uint8_t os_compress;
+ uint8_t os_copies;
+ uint8_t os_dedup_checksum;
+ uint8_t os_dedup_verify;
+ uint8_t os_logbias;
+ uint8_t os_primary_cache;
+ uint8_t os_secondary_cache;
+ uint8_t os_sync;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -101,51 +113,69 @@ typedef struct objset_impl {
/* stuff we store for the user */
kmutex_t os_user_ptr_lock;
void *os_user_ptr;
-} objset_impl_t;
+ /* SA layout/attribute registration */
+ sa_os_t *os_sa;
+};
+
+#define DMU_META_OBJSET 0
#define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
#define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
(os)->os_secondary_cache == ZFS_CACHE_METADATA)
/* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent, uint64_t flags,
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
- boolean_t recursive);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+ uint64_t flags);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
void dmu_objset_stats(objset_t *os, nvlist_t *nv);
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp);
uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
int flags);
int dmu_objset_find_spa(spa_t *spa, const char *name,
int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
-int dmu_objset_prefetch(char *name, void *arg);
+int dmu_objset_prefetch(const char *name, void *arg);
void dmu_objset_byteswap(void *buf, size_t size);
int dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
/* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+boolean_t dmu_objset_is_dirty_anywhere(objset_t *os);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
- objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
-boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+ objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
index 3e026891153c..5b326cd99c09 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DMU_TRAVERSE_H
@@ -36,19 +35,27 @@ extern "C" {
struct dnode_phys;
struct dsl_dataset;
+struct zilog;
+struct arc_buf;
-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
- const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+ void *arg);
#define TRAVERSE_PRE (1<<0)
#define TRAVERSE_POST (1<<1)
#define TRAVERSE_PREFETCH_METADATA (1<<2)
#define TRAVERSE_PREFETCH_DATA (1<<3)
#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define TRAVERSE_HARD (1<<4)
-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+/* Special traverse error return value to indicate skipping of children */
+#define TRAVERSE_VISIT_NO_CHILDREN -1
+
+int traverse_dataset(struct dsl_dataset *ds,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
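A hedged sketch of the updated callback signature and pool walk (count_cb is hypothetical):

	/* Count every block pointer visited; returning TRAVERSE_VISIT_NO_CHILDREN
	 * from a TRAVERSE_PRE callback would prune the subtree instead. */
	static int
	count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	    struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
	    void *arg)
	{
		uint64_t *countp = arg;

		if (bp != NULL)
			(*countp)++;
		return (0);
	}

		uint64_t count = 0;

		(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    count_cb, &count);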
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
index 6aaf35dc038f..bbc66347d525 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DMU_TX_H
#define _SYS_DMU_TX_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/refcount.h>
@@ -58,6 +56,7 @@ struct dmu_tx {
txg_handle_t tx_txgh;
void *tx_tempreserve_cookie;
struct dmu_tx_hold *tx_needassign_txh;
+ list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
uint8_t tx_anyobj;
int tx_err;
#ifdef ZFS_DEBUG
@@ -77,6 +76,7 @@ enum dmu_tx_hold_type {
THT_FREE,
THT_ZAP,
THT_SPACE,
+ THT_SPILL,
THT_NUMTYPES
};
@@ -97,6 +97,11 @@ typedef struct dmu_tx_hold {
#endif
} dmu_tx_hold_t;
+typedef struct dmu_tx_callback {
+ list_node_t dcb_node; /* linked to tx_callbacks list */
+ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
+ void *dcb_data; /* caller private data */
+} dmu_tx_callback_t;
/*
* These routines are defined in dmu.h, and are called by the user.
@@ -108,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx);
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
/*
* These routines are defined in dmu_spa.h, and are called by the SPA.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
index 48e4da8cd647..9ad4be36bf85 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DNODE_H
@@ -33,6 +32,7 @@
#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -63,6 +63,18 @@ extern "C" {
#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
/*
+ * dnode id flags
+ *
+ * Note: a file will never ever have its
+ * ids moved from bonus->spill
+ * and only in a crypto environment would it be on spill
+ */
+#define DN_ID_CHKED_BONUS 0x1
+#define DN_ID_CHKED_SPILL 0x2
+#define DN_ID_OLD_EXIST 0x4
+#define DN_ID_NEW_EXIST 0x8
+
+/*
* Derived constants.
*/
#define DNODE_SIZE (1 << DNODE_SHIFT)
@@ -70,10 +82,12 @@ extern "C" {
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
+#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
/* The +2 here is a cheesy way to round up */
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
@@ -88,7 +102,7 @@ extern "C" {
#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
struct dmu_buf_impl;
-struct objset_impl;
+struct objset;
struct zio;
enum dnode_dirtycontext {
@@ -101,6 +115,9 @@ enum dnode_dirtycontext {
#define DNODE_FLAG_USED_BYTES (1<<0)
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
@@ -121,7 +138,8 @@ typedef struct dnode_phys {
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
} dnode_phys_t;
typedef struct dnode {
@@ -136,9 +154,10 @@ typedef struct dnode {
list_node_t dn_link;
/* immutable: */
- struct objset_impl *dn_objset;
+ struct objset *dn_objset;
uint64_t dn_object;
struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
@@ -155,15 +174,21 @@ typedef struct dnode {
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint8_t dn_next_bonustype[TXG_SIZE];
+ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
@@ -183,33 +208,60 @@ typedef struct dnode {
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
+ list_t dn_dbufs; /* descendent dbufs */
+
+ /* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+ boolean_t dn_have_spill; /* have spill or are spilling */
+
/* parent IO for current sync write */
zio_t *dn_zio;
/* used in syncing context */
- dnode_phys_t *dn_oldphys;
+ uint64_t dn_oldused; /* old phys used bytes */
+ uint64_t dn_oldflags; /* old phys dn_flags */
+ uint64_t dn_olduid, dn_oldgid;
+ uint64_t dn_newuid, dn_newgid;
+ int dn_id_flags;
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[1]; /* sized dynamically */
+} dnode_children_t;
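The handle array mentioned above is presumably carved out with the usual flexible-array idiom; a sketch under that assumption (epb stands for the number of dnodes in the block and is not defined here):

		dnode_children_t *dnc;

		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;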
+
typedef struct free_range {
avl_node_t fr_node;
uint64_t fr_blkid;
uint64_t fr_nblks;
} free_range_t;
-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
+dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
-int dnode_hold(struct objset_impl *dd, uint64_t object,
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index a1c2896e3cfb..22733d070e8b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DATASET_H
@@ -33,6 +32,7 @@
#include <sys/bplist.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
#ifdef __cplusplus
extern "C" {
@@ -42,8 +42,6 @@ struct dsl_dataset;
struct dsl_dir;
struct dsl_pool;
-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
#define DS_FLAG_INCONSISTENT (1ULL<<0)
#define DS_IS_INCONSISTENT(ds) \
((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
@@ -63,6 +61,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
+#define DS_IS_DEFER_DESTROY(ds) \
+ ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -77,7 +83,7 @@ typedef struct dsl_dataset_phys {
uint64_t ds_num_children; /* clone/snap children; ==0 for head */
uint64_t ds_creation_time; /* seconds since 1970 */
uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */
+ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
uint64_t ds_used_bytes;
uint64_t ds_compressed_bytes;
uint64_t ds_uncompressed_bytes;
@@ -93,7 +99,8 @@ typedef struct dsl_dataset_phys {
blkptr_t ds_bp;
uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
- uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */
+ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
+ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
@@ -106,10 +113,13 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
- uint64_t ds_origin_txg;
/* has internal locking: */
- bplist_t ds_deadlist;
+ dsl_deadlist_t ds_deadlist;
+ bplist_t ds_pending_deadlist;
+
+ /* to protect against multiple concurrent incremental recv */
+ kmutex_t ds_recvlock;
/* protected by lock on pool's dp_dirty_datasets list */
txg_node_t ds_dirty_link;
@@ -120,8 +130,8 @@ typedef struct dsl_dataset {
* Protected by ds_lock:
*/
kmutex_t ds_lock;
- void *ds_user_ptr;
- dsl_dataset_evict_func_t *ds_user_evict_func;
+ objset_t *ds_objset;
+ uint64_t ds_userrefs;
/*
* ds_owner is protected by the ds_rwlock and the ds_lock
@@ -143,7 +153,32 @@ typedef struct dsl_dataset {
char ds_snapname[MAXNAMELEN];
} dsl_dataset_t;
-#define dsl_dataset_is_snapshot(ds) \
+struct dsl_ds_destroyarg {
+ dsl_dataset_t *ds; /* ds to destroy */
+ dsl_dataset_t *rm_origin; /* also remove our origin? */
+ boolean_t is_origin_rm; /* set if removing origin snap */
+ boolean_t defer; /* destroy -d requested? */
+ boolean_t releasing; /* destroying due to release? */
+ boolean_t need_prep; /* do we need to retry due to EBUSY? */
+};
+
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define MAX_TAG_PREFIX_LEN 17
+
+struct dsl_ds_holdarg {
+ dsl_sync_task_group_t *dstg;
+ char *htag;
+ char *snapname;
+ boolean_t recursive;
+ boolean_t gotone;
+ boolean_t temphold;
+ char failed[MAXPATHLEN];
+};
+
+#define dsl_dataset_is_snapshot(ds) \
((ds)->ds_phys->ds_num_children != 0)
#define DS_UNIQUE_IS_ACCURATE(ds) \
@@ -152,36 +187,43 @@ typedef struct dsl_dataset {
int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, int flags, void *owner,
- dsl_dataset_t **dsp);
+int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+ void *tag, dsl_dataset_t **dsp);
int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
- int flags, void *owner, dsl_dataset_t **);
+ boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_disown(dsl_dataset_t *ds, void *owner);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
- void *owner);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner);
+ void *tag);
+void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
+int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
+int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
dsl_checkfunc_t dsl_dataset_destroy_check;
dsl_syncfunc_t dsl_dataset_destroy_sync;
dsl_checkfunc_t dsl_dataset_snapshot_check;
dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost);
+dsl_syncfunc_t dsl_dataset_user_hold_sync;
int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
+int dsl_dataset_promote(const char *name, char *conflsnap);
int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
boolean_t force);
-
-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive, boolean_t temphold, int cleanup_fd);
+int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold);
+int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive);
+int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+ char *htag, boolean_t retry);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
@@ -192,10 +234,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
-boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx, boolean_t async);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+ uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
@@ -211,13 +255,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
uint64_t asize, uint64_t inflight, uint64_t *used,
uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx);
-int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation);
-void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags);
-int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation,
- dmu_tx_t *tx);
+int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+ uint64_t quota);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
+int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation);
+
+int dsl_destroy_inconsistent(const char *dsname, void *arg);
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
new file mode 100644
index 000000000000..d2c16d72c17e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+ objset_t *dl_os;
+ uint64_t dl_object;
+ avl_tree_t dl_tree;
+ boolean_t dl_havetree;
+ struct dmu_buf *dl_dbuf;
+ dsl_deadlist_phys_t *dl_phys;
+ kmutex_t dl_lock;
+
+ /* if it's the old on-disk format: */
+ bpobj_t dl_bpobj;
+ boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+ avl_node_t dle_node;
+ uint64_t dle_mintxg;
+ bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+ uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
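/*
 * Editorial sketch (not part of this change): a minimal example of driving the
 * deadlist API declared above from syncing context.  It assumes a valid
 * objset_t *os, dmu_tx_t *tx, blkptr_t *bp and a snapshot txg 'mintxg' are
 * already in hand; error handling is omitted.
 */
dsl_deadlist_t dl;
uint64_t dlobj, used, comp, uncomp;

dlobj = dsl_deadlist_alloc(os, tx);		/* create the on-disk object */
dsl_deadlist_open(&dl, os, dlobj);		/* bind the in-core state */
dsl_deadlist_add_key(&dl, mintxg, tx);		/* one bpobj per snapshot txg */
dsl_deadlist_insert(&dl, bp, tx);		/* record a freed block pointer */
dsl_deadlist_space(&dl, &used, &comp, &uncomp);	/* totals kept in dl_phys */
dsl_deadlist_close(&dl);
dsl_deadlist_free(os, dlobj, tx);		/* destroy the object again */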
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
index b064c9228ec8..73c43bd23879 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
@@ -53,6 +52,9 @@ extern "C" {
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
/*
* Note: the names of properties that are marked delegatable are also
@@ -62,6 +64,7 @@ extern "C" {
int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
index 56d06388cc72..2191635dd813 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -70,7 +69,8 @@ typedef struct dsl_dir_phys {
uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
uint64_t dd_flags;
uint64_t dd_used_breakdown[DD_USED_NUM];
- uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+ uint64_t dd_clones; /* dsl_dir objects */
+ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;
struct dsl_dir {
@@ -89,6 +89,8 @@ struct dsl_dir {
/* Protected by dd_lock */
kmutex_t dd_lock;
list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ uint64_t dd_origin_txg;
/* gross estimate of space used by in-flight tx's */
uint64_t dd_tempreserved[TXG_SIZE];
@@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+ uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation);
int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
+#define XLATION_DIR_NAME "$XLATION"
+#define FREE_DIR_NAME "$FREE"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
index d8da295f3386..7d25bd7c020d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_POOL_H
@@ -32,6 +31,9 @@
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -42,12 +44,7 @@ struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
-
-enum scrub_func {
- SCRUB_FUNC_NONE,
- SCRUB_FUNC_CLEAN,
- SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;
/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -75,6 +72,7 @@ typedef struct dsl_pool {
struct objset *dp_meta_objset;
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
+ struct dsl_dir *dp_free_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
@@ -83,25 +81,18 @@ typedef struct dsl_pool {
blkptr_t dp_meta_rootbp;
list_t dp_synced_datasets;
hrtime_t dp_read_overhead;
- uint64_t dp_throughput;
+ uint64_t dp_throughput; /* bytes per millisec */
uint64_t dp_write_limit;
+ uint64_t dp_tmp_userrefs_obj;
+ bpobj_t dp_free_bpobj;
+
+ struct dsl_scan *dp_scan;
/* Uses dp_lock */
kmutex_t dp_lock;
uint64_t dp_space_towrite[TXG_SIZE];
uint64_t dp_tempreserved[TXG_SIZE];
- enum scrub_func dp_scrub_func;
- uint64_t dp_scrub_queue_obj;
- uint64_t dp_scrub_min_txg;
- uint64_t dp_scrub_max_txg;
- zbookmark_t dp_scrub_bookmark;
- boolean_t dp_scrub_pausing;
- boolean_t dp_scrub_isresilver;
- uint64_t dp_scrub_start_time;
- kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
- boolean_t dp_scrub_restart;
-
/* Has its own locking */
tx_state_t dp_tx;
txg_list_t dp_dirty_datasets;
@@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_memory_pressure(dsl_pool_t *dp);
void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
- struct dmu_tx *tx);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+ const blkptr_t *bpp);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t *now, dmu_tx_t *tx);
+extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, dmu_tx_t *tx);
+extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
index 26018a46d1b2..a636ad35096b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_PROP_H
@@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record {
void *cbr_arg;
} dsl_prop_cb_record_t;
+typedef struct dsl_props_arg {
+ nvlist_t *pa_props;
+ zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+typedef struct dsl_prop_set_arg {
+ const char *psa_name;
+ zprop_source_t psa_source;
+ int psa_intsz;
+ int psa_numints;
+ const void *psa_value;
+
+ /*
+ * Used to handle the special requirements of the quota and reservation
+ * properties.
+ */
+ uint64_t psa_effective_value;
+} dsl_prop_setarg_t;
+
int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg);
int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
@@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname,
int intsz, int numints, void *buf, char *setpoint);
int dsl_prop_get_integer(const char *ddname, const char *propname,
uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
int intsz, int numints, void *buf, char *setpoint);
int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
+ int intsz, int numints, void *buf, char *setpoint,
+ boolean_t snapshot);
dsl_syncfunc_t dsl_props_set_sync;
int dsl_prop_set(const char *ddname, const char *propname,
- int intsz, int numints, const void *buf);
-int dsl_props_set(const char *dsname, nvlist_t *nvl);
-void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx);
+ zprop_source_t source, int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
+void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ dmu_tx_t *tx);
+
+void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+ zprop_source_t source, uint64_t *value);
+int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#ifdef ZFS_DEBUG
+void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) \
+ dsl_prop_check_prediction((dd), (psa))
+#else
+#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */
+#endif
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
+boolean_t dsl_prop_get_hasrecvd(objset_t *os);
+void dsl_prop_set_hasrecvd(objset_t *os);
+void dsl_prop_unset_hasrecvd(objset_t *os);
void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
void dsl_prop_nvlist_add_string(nvlist_t *nv,
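/*
 * Editorial sketch (not part of this change): predicting the effective value
 * of a numeric property with the setarg interface declared above.  'dd' is an
 * assumed dsl_dir_t *, and the 10GB quota is a hypothetical local value.
 */
dsl_prop_setarg_t psa;
uint64_t newquota = 10ULL << 30;

dsl_prop_setarg_init_uint64(&psa, "quota", ZPROP_SRC_LOCAL, &newquota);
if (dsl_prop_predict_sync(dd, &psa) == 0) {
	/* psa_effective_value now holds the predicted post-sync value */
	DSL_PROP_CHECK_PREDICTION(dd, &psa);
}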
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 000000000000..c79666e67de0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SCAN_H
+#define _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+ uint64_t scn_func; /* pool_scan_func_t */
+ uint64_t scn_state; /* dsl_scan_state_t */
+ uint64_t scn_queue_obj;
+ uint64_t scn_min_txg;
+ uint64_t scn_max_txg;
+ uint64_t scn_cur_min_txg;
+ uint64_t scn_cur_max_txg;
+ uint64_t scn_start_time;
+ uint64_t scn_end_time;
+ uint64_t scn_to_examine; /* total bytes to be scanned */
+ uint64_t scn_examined; /* bytes scanned so far */
+ uint64_t scn_to_process;
+ uint64_t scn_processed;
+ uint64_t scn_errors; /* scan I/O error count */
+ uint64_t scn_ddt_class_max;
+ ddt_bookmark_t scn_ddt_bookmark;
+ zbookmark_t scn_bookmark;
+ uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+ DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+ struct dsl_pool *scn_dp;
+
+ boolean_t scn_pausing;
+ uint64_t scn_restart_txg;
+ uint64_t scn_sync_start_time;
+ zio_t *scn_zio_root;
+
+ /* for debugging / information */
+ uint64_t scn_visited_this_txg;
+
+ dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
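/*
 * Editorial sketch (not part of this change): since every field of
 * dsl_scan_phys_t is a uint64_t (or a struct made of uint64_t's), the record
 * can be byteswapped as a flat array of SCAN_PHYS_NUMINTS words, roughly:
 */
static void
dsl_scan_phys_byteswap_sketch(dsl_scan_phys_t *scn_phys)
{
	uint64_t *words = (uint64_t *)scn_phys;

	for (int i = 0; i < SCAN_PHYS_NUMINTS; i++)
		words[i] = BSWAP_64(words[i]);
}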
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
index 4995bfe5acca..9126290cdb5b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
#define _SYS_DSL_SYNCTASK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/txg.h>
#include <sys/zfs_context.h>
@@ -38,7 +35,7 @@ extern "C" {
struct dsl_pool;
typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
typedef struct dsl_sync_task {
list_node_t dst_node;
@@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
txg_node_t dstg_node;
list_t dstg_tasks;
struct dsl_pool *dstg_pool;
- cred_t *dstg_cr;
uint64_t dstg_txg;
int dstg_err;
int dstg_space;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index c77b77205490..583d6303bd5a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -36,9 +35,6 @@
extern "C" {
#endif
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
extern space_map_ops_t *zfs_metaslab_ops;
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@@ -58,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
+
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+ int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
#ifdef __cplusplus
}
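/*
 * Editorial sketch (not part of this change): the class-wide space accounting
 * added above is delta-based.  An allocation of 'size' bytes, for example,
 * would only move the allocated total ('mc' and 'size' are assumed to exist):
 */
metaslab_class_space_update(mc, size, 0, 0, 0);
ASSERT3U(metaslab_class_get_alloc(mc), <=, metaslab_class_get_space(mc));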
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 5f0b77086b03..07988dd51a73 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -37,9 +37,14 @@ extern "C" {
#endif
struct metaslab_class {
+ spa_t *mc_spa;
metaslab_group_t *mc_rotor;
- uint64_t mc_allocated;
space_map_ops_t *mc_ops;
+ uint64_t mc_aliquot;
+ uint64_t mc_alloc; /* total allocated space */
+ uint64_t mc_deferred; /* total deferred frees */
+ uint64_t mc_space; /* total space (alloc + free) */
+ uint64_t mc_dspace; /* total deflated space */
};
struct metaslab_group {
@@ -48,6 +53,7 @@ struct metaslab_group {
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
int64_t mg_bias;
+ int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
metaslab_group_t *mg_prev;
@@ -67,7 +73,9 @@ struct metaslab {
space_map_obj_t ms_smo_syncing; /* syncing space map object */
space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */
space_map_t ms_map; /* in-core free space map */
+ int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
index e84b1bf65f99..37a28b8c14b9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
#define _SYS_REFCOUNT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cdefs.h>
#include <sys/types.h>
#include_next <sys/refcount.h>
@@ -45,7 +42,7 @@ extern "C" {
*/
#define FTAG ((char *)__func__)
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
typedef struct reference {
list_node_t ref_link;
void *ref_holder;
@@ -72,11 +69,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag);
int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_sysinit(void);
void refcount_fini(void);
-#else /* DEBUG */
+#else /* ZFS_DEBUG */
typedef struct refcount {
uint64_t rc_count;
@@ -93,11 +91,16 @@ typedef struct refcount {
atomic_add_64_nv(&(rc)->rc_count, number)
#define refcount_remove_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, -number)
+#define refcount_transfer(dst, src) { \
+ uint64_t __tmp = (src)->rc_count; \
+ atomic_add_64(&(src)->rc_count, -__tmp); \
+ atomic_add_64(&(dst)->rc_count, __tmp); \
+}
#define refcount_sysinit()
#define refcount_fini()
-#endif /* DEBUG */
+#endif /* ZFS_DEBUG */
#ifdef __cplusplus
}
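/*
 * Editorial sketch (not part of this change): refcount_transfer() moves the
 * whole count (and, in ZFS_DEBUG builds, the holder records) from 'src' to
 * 'dst'; in the non-debug variant above it is just two atomic adds.
 */
refcount_t a, b;

refcount_create(&a);
refcount_create(&b);
(void) refcount_add(&a, FTAG);
refcount_transfer(&b, &a);		/* a is now zero, b holds the count */
ASSERT(refcount_is_zero(&a));
(void) refcount_remove(&b, FTAG);
refcount_destroy(&a);
refcount_destroy(&b);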
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
new file mode 100644
index 000000000000..e12520105bc2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_H
+#define _SYS_SA_H
+
+#include <sys/dmu.h>
+#include <sys/uio.h>
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible, new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+ char *sa_name; /* attribute name */
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap; /* bswap function enum */
+ sa_attr_type_t sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+ boolean_t, void *userptr);
+
+/*
+ * array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_ADD_BULK_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer
+ *
+ */
+typedef struct sa_bulk_attr {
+ void *sa_data;
+ sa_data_locator_t *sa_data_func;
+ uint16_t sa_length;
+ sa_attr_type_t sa_attr;
+ /* the following are private to the sa framework */
+ void *sa_addr;
+ uint16_t sa_buftype;
+ uint16_t sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * special macro for adding entries for bulk attr support
+ * bulk - sa_bulk_attr_t
+ * count - integer that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data.
+ * data - pointer to data.
+ * len - length of data
+ */
+
+#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+ b[idx].sa_attr = attr;\
+ b[idx].sa_data_func = func; \
+ b[idx].sa_data = data; \
+ b[idx++].sa_length = len; \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+ SA_HDL_SHARED,
+ SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+ uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init();
+void sa_cache_fini();
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
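/*
 * Editorial sketch (not part of this change): fetching two attributes in one
 * pass with the bulk interface declared above.  SA_HYP_ATIME and SA_HYP_SIZE
 * stand in for sa_attr_type_t values returned by sa_setup(), and 'hdl' is an
 * already-held sa_handle_t; both are assumptions for illustration only.
 */
sa_bulk_attr_t bulk[2];
uint64_t atime[2], size;
int count = 0, error;

SA_ADD_BULK_ATTR(bulk, count, SA_HYP_ATIME, NULL, &atime, sizeof (atime));
SA_ADD_BULK_ATTR(bulk, count, SA_HYP_SIZE, NULL, &size, sizeof (size));
error = sa_bulk_lookup(hdl, bulk, count);	/* fills in sa_addr/sa_size */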
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
new file mode 100644
index 000000000000..6661e47cfc83
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+ sa_attr_type_t sa_attr;
+ uint8_t sa_registered;
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap;
+ char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | unused | len | bswap | attr num |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16 0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ * ......
+ *
+ */
+
+#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
+
+#define TOC_OFF(x) BF32_GET(x, 0, 23)
+#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
+#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
+#define TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+ BF32_SET(x, 31, 1, 1); \
+ BF32_SET(x, 24, 7, len_idx); \
+ BF32_SET(x, 0, 24, offset); \
+}
+
+#define SA_LAYOUTS "LAYOUTS"
+#define SA_REGISTRY "REGISTRY"
+
+/*
+ * Each unique layout will have its own table
+ * sa_lot (layout_table)
+ */
+typedef struct sa_lot {
+ avl_node_t lot_num_node;
+ avl_node_t lot_hash_node;
+ uint64_t lot_num;
+ uint64_t lot_hash;
+ sa_attr_type_t *lot_attrs; /* array of attr #'s */
+ uint32_t lot_var_sizes; /* how many aren't fixed size */
+ uint32_t lot_attr_count; /* total attr count */
+ list_t lot_idx_tab; /* should be only a couple of entries */
+ int lot_instance; /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+ list_node_t sa_next;
+ sa_lot_t *sa_layout;
+ uint16_t *sa_variable_lengths;
+ refcount_t sa_refcount;
+ uint32_t *sa_idx_tab; /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different tables of
+ * contents if you had several variable-sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * One is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read. The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout. Both
+ * of these trees are interconnected. The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+ kmutex_t sa_lock;
+ boolean_t sa_need_attr_registration;
+ boolean_t sa_force_spill;
+ uint64_t sa_master_obj;
+ uint64_t sa_reg_attr_obj;
+ uint64_t sa_layout_attr_obj;
+ int sa_num_attrs;
+ sa_attr_table_t *sa_attr_table; /* private attr table */
+ sa_update_cb_t *sa_update_cb;
+ avl_tree_t sa_layout_num_tree; /* keyed by layout number */
+ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
+ int sa_user_table_sz;
+ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attributes which are determined by the "layout number"
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 are the layout number
+ * Bits 10-15 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ * 2 ==> 16 byte header
+ *
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+ SA_BONUS = 1,
+ SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+ SA_LOOKUP,
+ SA_UPDATE,
+ SA_ADD,
+ SA_REPLACE,
+ SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+ kmutex_t sa_lock;
+ dmu_buf_t *sa_bonus;
+ dmu_buf_t *sa_spill;
+ objset_t *sa_os;
+ void *sa_userp;
+ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
+ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
+};
+
+#define SA_GET_DB(hdl, type) \
+ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define SA_GET_HDR(hdl, type) \
+ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+ type))->db.db_data))
+
+#define SA_IDX_TAB_GET(hdl, type) \
+ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define IS_SA_BONUSTYPE(a) \
+ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define SA_BONUSTYPE_FROM_DB(db) \
+ (dmu_get_bonustype((dmu_buf_t *)db))
+
+#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define SA_LAYOUT_NUM(x, type) \
+ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+ SA_REGISTERED_LEN(sa, attr))
+
+#define SA_SET_HDR(hdr, num, size) \
+ { \
+ hdr->sa_magic = SA_MAGIC; \
+ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+ }
+
+#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+ { \
+ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+ bulk.sa_buftype = type; \
+ bulk.sa_addr = \
+ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+ (uintptr_t)hdr); \
+}
+
+#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+ sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+ uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+ uint16_t *, sa_hdr_phys_t *);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
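/*
 * Editorial sketch (not part of this change): round-tripping sa_layout_info
 * with the macros above.  A 16-byte header stores hdrsz/8 == 2 alongside the
 * layout number, and the decode macros recover both fields.
 */
sa_hdr_phys_t hdr = { 0 };

SA_SET_HDR(&hdr, 2, 16);		/* layout number 2, 16-byte header */
ASSERT3U(SA_HDR_LAYOUT_NUM(&hdr), ==, 2);
ASSERT3U(SA_HDR_SIZE(&hdr), ==, 16);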
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index f54a5dc52f23..23d48c8a9e80 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -43,8 +42,13 @@ extern "C" {
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
struct dsl_pool;
/*
@@ -134,15 +138,15 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
+ * 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
+ * a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -166,25 +170,29 @@ typedef struct zio_cksum {
* cksum checksum function
* comp compression function
* G gang block indicator
- * E endianness
- * type DMU object type
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
* lvl level of indirection
- * birth txg transaction group in which the block was born
+ * type DMU object type
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
/*
* Macros to get and set fields in a bp or DVA.
*/
@@ -209,7 +217,6 @@ typedef struct blkptr {
#define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
@@ -218,20 +225,35 @@ typedef struct blkptr {
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
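/*
 * Editorial sketch (not part of this change): with the split birth txgs above,
 * a dedup'd or rewritten block keeps its original allocation txg, while the
 * common case stores zero and falls back to the logical birth.  'bp' is an
 * assumed blkptr_t *.
 */
BP_SET_BIRTH(bp, 300, 100);	/* logically born 300, physically allocated 100 */
ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, 100);
BP_SET_BIRTH(bp, 300, 300);	/* same txg: blk_phys_birth is stored as 0 */
ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, 300);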
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -239,7 +261,7 @@ typedef struct blkptr {
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@@ -255,6 +277,12 @@ typedef struct blkptr {
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@@ -274,7 +302,10 @@ typedef struct blkptr {
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \
{ \
@@ -287,14 +318,12 @@ typedef struct blkptr {
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
+ (bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
-#define BLK_FILL_ALREADY_FREED (-1ULL)
-
/*
* Note: the byteorder is either 0 or -1, both of which are palindromes.
* This simplifies the endianness handling a bit.
@@ -309,27 +338,88 @@ typedef struct blkptr {
#define BP_SPRINTF_LEN 320
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int size = BP_SPRINTF_LEN; \
+ int len = 0; \
+ int copies = 0; \
+ \
+ if (bp == NULL) { \
+ len = func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len = func(buf + len, size - len, "<hole>"); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)bp->blk_fill, \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
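/*
 * Editorial sketch (not part of this change): a kernel caller is expected to
 * expand this macro with snprintf() and a BP_SPRINTF_LEN-sized buffer.  'bp'
 * is an assumed blkptr_t *, and the type/checksum/compress strings here are
 * placeholder literals for illustration.
 */
char blkbuf[BP_SPRINTF_LEN];

SPRINTF_BLKPTR(snprintf, ' ', blkbuf, bp, "object", "fletcher4", "lzjb");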
+
#include <sys/dmu.h>
#define BP_GET_BUFC_TYPE(bp) \
(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
+
+typedef enum spa_import_type {
+ SPA_IMPORT_EXISTING,
+ SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+ nvlist_t *policy, nvlist_t **config);
extern int spa_get_stats(const char *pool, nvlist_t **config,
char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
const char *history_str, nvlist_t *zplprops);
-extern int spa_check_rootconf(char *devpath, char *devid,
- nvlist_t **bestconf, uint64_t *besttxg);
-extern boolean_t spa_rootdev_validate(nvlist_t *nv);
extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+ uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -341,12 +431,23 @@ extern void spa_async_suspend(spa_t *spa);
extern void spa_async_resume(spa_t *spa);
extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define SPA_REMOVE_UNSPARE 0x01
+#define SPA_REMOVE_DONE 0x02
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -355,8 +456,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp);
/* spare state (which is global across all pools) */
extern void spa_spare_add(vdev_t *vd);
@@ -370,15 +474,23 @@ extern void spa_l2cache_remove(vdev_t *vd);
extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
extern void spa_l2cache_activate(vdev_t *vd);
extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
extern void spa_sync_allpools(void);
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred. XXX so can't we change it back to 1?
+ */
+#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */
+#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
+#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
+
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
@@ -396,7 +508,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config);
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
int getstats);
extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
/*
* Miscellaneous SPA routines in spa_misc.c
@@ -404,7 +515,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
@@ -413,6 +524,7 @@ extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
+#define SCL_NONE 0x00
#define SCL_CONFIG 0x01
#define SCL_STATE 0x02
#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
@@ -432,12 +544,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
+/* Log state */
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state */
+ SPA_LOG_MISSING, /* missing log(s) */
+ SPA_LOG_CLEAR, /* clear the log(s) */
+ SPA_LOG_GOOD, /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_offline_log(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
/* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@@ -449,36 +579,49 @@ extern char *spa_name(spa_t *spa);
extern uint64_t spa_guid(spa_t *spa);
extern uint64_t spa_last_synced_txg(spa_t *spa);
extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
extern int spa_busy(void);
extern uint8_t spa_get_failmode(spa_t *spa);
extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
boolean_t l2cache);
extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
+
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
#define strtonum(str, nptr) zfs_strtonum((str), (nptr))
@@ -491,10 +634,11 @@ typedef enum history_log_type {
} history_log_type_t;
typedef struct history_arg {
- const char *ha_history_str;
+ char *ha_history_str;
history_log_type_t ha_log_type;
history_internal_events_t ha_event;
- char ha_zone[MAXPATHLEN];
+ char *ha_zone;
+ uid_t ha_uid;
} history_arg_t;
extern char *spa_his_ievent_table[];
@@ -504,16 +648,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
char *his_buf);
extern int spa_history_log(spa_t *spa, const char *his_buf,
history_log_type_t what);
-void spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+ spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
/* error handling */
struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t stateoroffset, uint64_t length);
+ zio_t *zio, uint64_t stateoroffset, uint64_t length);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
extern uint64_t spa_get_errlog_size(spa_t *spa);
extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
@@ -544,7 +689,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ sprintf_blkptr(__blkbuf, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
index b56073b97516..1d3622f5a108 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SPA_BOOT_H
#define _SYS_SPA_BOOT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#ifdef __cplusplus
@@ -36,7 +34,6 @@ extern "C" {
extern char *spa_get_bootprop(char *prop);
extern void spa_free_bootprop(char *prop);
-extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index ecb065c3f98c..a2f15d2863fa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -36,6 +35,7 @@
#include <sys/avl.h>
#include <sys/refcount.h>
#include <sys/bplist.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -78,13 +78,6 @@ typedef struct spa_config_dirent {
char *scd_path;
} spa_config_dirent_t;
-typedef enum spa_log_state {
- SPA_LOG_UNKNOWN = 0, /* unknown log state */
- SPA_LOG_MISSING, /* missing log(s) */
- SPA_LOG_CLEAR, /* clear the log(s) */
- SPA_LOG_GOOD, /* log(s) are good */
-} spa_log_state_t;
-
enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH,
@@ -93,6 +86,25 @@ enum zio_taskq_type {
ZIO_TASKQ_TYPES
};
+/*
+ * State machine for the zpool-poolname process. The state transitions
+ * are as follows:
+ *
+ * From To Routine
+ * PROC_NONE -> PROC_CREATED spa_activate()
+ * PROC_CREATED -> PROC_ACTIVE spa_thread()
+ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
+ * PROC_DEACTIVATE -> PROC_GONE spa_thread()
+ * PROC_GONE -> PROC_NONE spa_deactivate()
+ */
+typedef enum spa_proc_state {
+ SPA_PROC_NONE, /* spa_proc = &p0, no process created */
+ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
+ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
+ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
+ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
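/*
 * Illustrative sketch (not part of this patch): how spa_deactivate()-style
 * code might drive the PROC_ACTIVE -> PROC_DEACTIVATE -> PROC_GONE
 * transitions described above.  It assumes the spa_proc_lock, spa_proc_cv
 * and spa_proc_state fields added to struct spa further down.
 */
static void
spa_proc_teardown_sketch(spa_t *spa)
{
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state == SPA_PROC_ACTIVE) {
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state != SPA_PROC_GONE)
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	mutex_exit(&spa->spa_proc_lock);
}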
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -101,13 +113,15 @@ struct spa {
avl_node_t spa_avl; /* node in spa_namespace_avl */
nvlist_t *spa_config; /* last synced config */
nvlist_t *spa_config_syncing; /* currently syncing config */
+ nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
uint64_t spa_config_txg; /* txg of last config change */
int spa_sync_pass; /* iterate-to-convergence */
pool_state_t spa_state; /* pool state */
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
- boolean_t spa_load_verbatim; /* load the given config? */
+ uint64_t spa_import_flags; /* import specific flags */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
metaslab_class_t *spa_normal_class; /* normal data class */
@@ -115,6 +129,9 @@ struct spa {
uint64_t spa_first_txg; /* first txg after spa_open() */
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
@@ -124,21 +141,24 @@ struct spa {
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_sync_bplist_obj; /* object for deferred frees */
- bplist_t spa_sync_bplist; /* deferred-free bplist */
+ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
+ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
+ boolean_t spa_extreme_rewind; /* rewind past deferred frees */
+ uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
- uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
- uint64_t spa_scrub_errors; /* scrub I/O error count */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
uint8_t spa_scrub_finished; /* indicator to rotate logs */
uint8_t spa_scrub_started; /* started since last boot */
uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
+ uint64_t spa_scan_pass_start; /* start time per pass/reboot */
+ uint64_t spa_scan_pass_exam; /* examined bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
@@ -146,7 +166,14 @@ struct spa {
uint16_t spa_async_tasks; /* async task mask */
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
- boolean_t spa_last_open_failed; /* true if last open faled */
+ int spa_last_open_failed; /* error if last open failed */
+ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
+ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_txg; /* ub txg that loaded */
+ uint64_t spa_load_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_meta_errors; /* verify metadata err count */
+ uint64_t spa_load_data_errors; /* verify data err count */
+ uint64_t spa_verify_min_txg; /* start txg of verify scrub */
kmutex_t spa_errlog_lock; /* error log lock */
uint64_t spa_errlog_last; /* last error log object */
uint64_t spa_errlog_scrub; /* scrub error log object */
@@ -168,10 +195,27 @@ struct spa {
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
+ uint8_t spa_claiming; /* pool is doing zil_claim() */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */
+ uint64_t spa_autoexpand; /* lun expansion on/off */
+ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+ uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_ditto; /* dedup ditto threshold */
+ uint64_t spa_dedup_checksum; /* default dedup checksum */
+ uint64_t spa_dspace; /* dspace in normal class */
+ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
+ kmutex_t spa_proc_lock; /* protects spa_proc* */
+ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
+ spa_proc_state_t spa_proc_state; /* see definition */
+ struct proc *spa_proc; /* "zpool-poolname" process */
+ uint64_t spa_did; /* if procp != p0, did of t1 */
+ boolean_t spa_autoreplace; /* autoreplace set in open */
+ int spa_vdev_locks; /* locks grabbed */
+ uint64_t spa_creation_version; /* version at pool creation */
+ uint64_t spa_prev_software_version;
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
@@ -180,16 +224,13 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
refcount_t spa_refcount; /* number of opens */
+#ifndef sun
+ boolean_t spa_splitting_newspa; /* creating new spa in split */
+#endif
};
extern const char *spa_config_path;
-#define BOOTFS_COMPRESS_VALID(compress) \
- ((compress) == ZIO_COMPRESS_LZJB || \
- ((compress) == ZIO_COMPRESS_ON && \
- ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
- (compress) == ZIO_COMPRESS_OFF)
-
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
index 23bdff211b4a..e323d5efabb7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_TXG_H
#define _SYS_TXG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/zfs_context.h>
@@ -41,6 +39,9 @@ extern "C" {
#define TXG_INITIAL TXG_SIZE /* initial txg */
#define TXG_IDX (txg & TXG_MASK)
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define TXG_DEFER_SIZE 2
+
#define TXG_WAIT 1ULL
#define TXG_NOWAIT 2ULL
@@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp);
extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
extern void txg_rele_to_quiesce(txg_handle_t *txghp);
extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
+extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
/*
* Delay the caller by the specified number of ticks or until
@@ -117,6 +117,7 @@ extern void txg_list_create(txg_list_t *tl, size_t offset);
extern void txg_list_destroy(txg_list_t *tl);
extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
index 7413c662b355..7b356eac1293 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,13 +37,13 @@ struct tx_cpu {
kmutex_t tc_lock;
kcondvar_t tc_cv[TXG_SIZE];
uint64_t tc_count[TXG_SIZE];
+ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
char tc_pad[16];
};
typedef struct tx_state {
tx_cpu_t *tx_cpu; /* protects right to enter txg */
kmutex_t tx_sync_lock; /* protects tx_state_t */
- krwlock_t tx_suspend;
uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
@@ -64,6 +64,8 @@ typedef struct tx_state {
kthread_t *tx_sync_thread;
kthread_t *tx_quiesce_thread;
+
+ taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
} tx_state_t;
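/*
 * Illustrative sketch (not part of this patch): tc_callbacks and
 * tx_commit_cb_taskq back the commit-callback support exposed through
 * dmu_tx_callback_register() in dmu.h (also touched by this patch).
 * A registered callback runs from tx_commit_cb_taskq once its txg has
 * synced (error == 0) or been aborted; the callback and data below are
 * hypothetical.
 */
static void
my_commit_cb_sketch(void *arg, int error)
{
	if (error == 0)
		atomic_inc_64((uint64_t *)arg);	/* e.g. count synced txgs */
}
/* After dmu_tx_assign() succeeds: dmu_tx_callback_register(tx, my_commit_cb_sketch, &counter); */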
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
index 93d936ae4b18..b5bb91573145 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,19 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_UBERBLOCK_H
#define _SYS_UBERBLOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/vdev.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#ifdef __cplusplus
extern "C" {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
index b49df8ae0ce3..6ab6aa3135a2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -33,11 +32,6 @@ extern "C" {
#endif
/*
- * For zdb use and debugging purposes only
- */
-extern uint64_t ub_max_txg;
-
-/*
* The uberblock version is incremented whenever an incompatible on-disk
* format change is made to the SPA, DMU, or ZAP.
*
@@ -57,6 +51,9 @@ struct uberblock {
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
+
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
};
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
index 933255464261..941f234dc68f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -47,10 +46,11 @@ typedef enum vdev_dtl_type {
extern boolean_t zfs_nocacheflush;
extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
extern int vdev_validate(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
@@ -69,26 +69,31 @@ extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
+
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
+extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_expand(vdev_t *vd, uint64_t txg);
+extern void vdev_split(vdev_t *vd);
+
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
- boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta, boolean_t update_root);
+extern void vdev_space_update(vdev_t *vd,
+ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-extern int vdev_fault(spa_t *spa, uint64_t guid);
-extern int vdev_degrade(spa_t *spa, uint64_t guid);
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_state_t *);
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
@@ -119,8 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+typedef enum vdev_config_flag {
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+ boolean_t getstats, vdev_config_flag_t flags);
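/*
 * Illustrative sketch (not part of this patch): generating a label config
 * for an L2ARC device with the new flag-based interface above.  B_FALSE for
 * getstats matches the old isspare/isl2cache call sites.
 */
static nvlist_t *
l2cache_label_config_sketch(spa_t *spa, vdev_t *vd)
{
	return (vdev_config_generate(spa, vd, B_FALSE, VDEV_CONFIG_L2CACHE));
}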
/*
* Label routines
@@ -136,7 +148,8 @@ typedef enum {
VDEV_LABEL_REPLACE, /* replace an existing device */
VDEV_LABEL_SPARE, /* add a new hot spare */
VDEV_LABEL_REMOVE, /* remove an existing device */
- VDEV_LABEL_L2CACHE /* add an L2ARC cache device */
+ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
+ VDEV_LABEL_SPLIT /* generating new label for split-off dev */
} vdev_labeltype_t;
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 93e410250880..7efa3f3b13cf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -62,6 +61,8 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef int vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef void vdev_hold_func_t(vdev_t *vd);
+typedef void vdev_rele_func_t(vdev_t *vd);
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -70,6 +71,8 @@ typedef struct vdev_ops {
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
+ vdev_hold_func_t *vdev_op_hold;
+ vdev_rele_func_t *vdev_op_rele;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -112,19 +115,28 @@ struct vdev {
uint64_t vdev_id; /* child number in vdev parent */
uint64_t vdev_guid; /* unique ID for this vdev */
uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_orig_guid; /* orig. guid prior to remove */
uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
vdev_ops_t *vdev_ops; /* vdev operations */
spa_t *vdev_spa; /* spa for this vdev */
void *vdev_tsd; /* type-specific data */
+ vnode_t *vdev_name_vp; /* vnode for pathname */
+ vnode_t *vdev_devid_vp; /* vnode for devid */
vdev_t *vdev_top; /* top-level vdev */
vdev_t *vdev_parent; /* parent vdev */
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
+ boolean_t vdev_reopening; /* reopen in progress? */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
@@ -139,10 +151,12 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
+ uint64_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
/*
* Leaf vdev state.
@@ -155,6 +169,7 @@ struct vdev {
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilvering; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
@@ -166,6 +181,8 @@ struct vdev {
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_splitting; /* split or repair in progress */
+ boolean_t vdev_delayed_close; /* delayed device close? */
uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
uint8_t vdev_detached; /* device detached? */
uint8_t vdev_cant_read; /* vdev is failing all reads */
@@ -176,6 +193,7 @@ struct vdev {
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
+ vdev_aux_t vdev_label_aux; /* on-disk aux state */
/*
* For DTrace to work in userland (libzpool) context, these fields must
@@ -189,6 +207,8 @@ struct vdev {
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
};
+#define VDEV_RAIDZ_MAXPARITY 3
+
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
@@ -204,8 +224,8 @@ struct vdev {
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
@@ -239,10 +259,14 @@ typedef struct vdev_label {
#define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2
#define VDEV_ALLOC_L2CACHE 3
+#define VDEV_ALLOC_ROOTPOOL 4
+#define VDEV_ALLOC_SPLIT 5
/*
* Allocate or free a vdev
*/
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
@@ -259,7 +283,8 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -279,13 +304,15 @@ extern vdev_ops_t vdev_disk_ops;
#endif
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
/*
* Common size functions
*/
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_rsize(vdev_t *vd);
+extern uint64_t vdev_get_min_asize(vdev_t *vd);
+extern void vdev_set_min_asize(vdev_t *vd);
/*
* zdb uses this tunable, so it must be declared here to make lint happy.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
index ea3a0f632055..a1130bbbaaae 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_H
#define _SYS_ZAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* ZAP - ZFS Attribute Processor
*
@@ -87,9 +84,6 @@
extern "C" {
#endif
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN 1024
-
/*
* The matchtype specifies which entry will be accessed.
* MT_EXACT: only find an exact match (non-normalized)
@@ -106,6 +100,18 @@ typedef enum matchtype
MT_FIRST
} matchtype_t;
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
/*
* Create a new zapobj with no attributes and return its object number.
* MT_EXACT will cause the zap object to only support MT_EXACT lookups,
@@ -123,6 +129,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
/*
* Create a new zapobj with no attributes from the given (unallocated)
@@ -185,6 +194,11 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int add, uint64_t *towrite, uint64_t *tooverwrite);
@@ -195,9 +209,12 @@ int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
* If an attribute with the given name already exists, the call will
* fail and return EEXIST.
*/
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
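/*
 * Illustrative sketch (not part of this patch): a ZAP keyed by raw uint64
 * arrays rather than strings, in the style of the DDT objects these
 * interfaces were added for.  The object type and block shifts are
 * placeholders borrowed from that usage.
 */
static uint64_t
uint64_key_zap_sketch(objset_t *os, dmu_tx_t *tx)
{
	uint64_t obj, key[2] = { 0x1234, 0x5678 }, val = 42;

	obj = zap_create_flags(os, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY,
	    DMU_OT_DDT_ZAP, 12, 12, DMU_OT_NONE, 0, tx);
	VERIFY(zap_add_uint64(os, obj, key, 2, sizeof (uint64_t), 1,
	    &val, tx) == 0);
	VERIFY(zap_lookup_uint64(os, obj, key, 2, sizeof (uint64_t), 1,
	    &val) == 0);
	return (obj);
}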
/*
* Set the attribute with the given name to the given value. If an
@@ -209,6 +226,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
/*
* Get the length (in integers) and the integer size of the specified
@@ -219,6 +239,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers);
/*
* Remove the specified attribute.
@@ -229,6 +251,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx);
/*
* Returns (in *count) the number of attributes in the specified zap
@@ -236,7 +260,6 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
/*
* Returns (in name) the name of the entry whose (value & mask)
* (za_first_integer) is value, or ENOENT if not found. The string
@@ -253,6 +276,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj,
*/
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx);
+
/*
* Manipulate entries where the name + value are the "same" (the name is
* a stringified version of the value).
@@ -260,6 +291,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t *valuep);
+
+/*
+ * The name is a stringified version of the key; increment its value by
+ * delta. Zero values will be zap_remove()-ed.
+ */
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx);
struct zap;
struct zap_leaf;
@@ -269,6 +317,7 @@ typedef struct zap_cursor {
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
+ uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
} zap_cursor_t;
@@ -320,6 +369,11 @@ void zap_cursor_advance(zap_cursor_t *zc);
uint64_t zap_cursor_serialize(zap_cursor_t *zc);
/*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
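/*
 * Illustrative sketch (not part of this patch): the canonical cursor walk
 * that the serialize/move_to_key routines above and below extend.
 */
static void
zap_walk_sketch(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* za.za_name, za.za_first_integer, etc. are valid here */
	}
	zap_cursor_fini(&zc);
}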
+
+/*
* Initialize a zap cursor pointing to the position recorded by
* zap_cursor_serialize (in the "serialized" argument). You can also
* use a "serialized" argument of 0 to start at the beginning of the
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
index c86bb16de268..1dc322e02f6f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -40,13 +39,13 @@ extern int fzap_default_block_shift;
#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-#define ZAP_MAXCD (uint32_t)(-1)
-#define ZAP_HASHBITS 28
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define ZAP_NEED_CD (-1U)
+
typedef struct mzap_ent_phys {
uint64_t mze_value;
uint32_t mze_cd;
@@ -67,9 +66,11 @@ typedef struct mzap_ent {
avl_node_t mze_node;
int mze_chunkid;
uint64_t mze_hash;
- mzap_ent_phys_t mze_phys;
+ uint32_t mze_cd; /* copy from mze_phys->mze_cd */
} mzap_ent_t;
+#define MZE_PHYS(zap, mze) \
+ (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
/*
* The (fat) zap is stored in one object. It is an array of
@@ -127,6 +128,7 @@ typedef struct zap_phys {
uint64_t zap_num_entries; /* number of entries */
uint64_t zap_salt; /* salt to stir into hash function */
uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
/*
* This structure is followed by padding, and then the embedded
* pointer table. The embedded pointer table takes up second
@@ -168,10 +170,13 @@ typedef struct zap {
typedef struct zap_name {
zap_t *zn_zap;
- const char *zn_name_orij;
+ int zn_key_intlen;
+ const void *zn_key_orig;
+ int zn_key_orig_numints;
+ const void *zn_key_norm;
+ int zn_key_norm_numints;
uint64_t zn_hash;
matchtype_t zn_matchtype;
- const char *zn_name_norm;
char zn_normbuf[ZAP_MAXNAMELEN];
} zap_name_t;
@@ -183,8 +188,11 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
void zap_evict(dmu_buf_t *db, void *vmzap);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
@@ -193,6 +201,7 @@ int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp);
+void fzap_prefetch(zap_name_t *zn);
int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
uint64_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
@@ -209,7 +218,8 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
+int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
index 14144e059e54..3a33636741d9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -19,20 +19,21 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_LEAF_H
#define _SYS_ZAP_LEAF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
+#include <sys/zap.h>
#ifdef __cplusplus
extern "C" {
#endif
struct zap;
+struct zap_name;
+struct zap_stats;
#define ZAP_LEAF_MAGIC 0x2AB1EAF
@@ -129,12 +130,12 @@ typedef struct zap_leaf_phys {
typedef union zap_leaf_chunk {
struct zap_leaf_entry {
uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
- uint8_t le_int_size; /* size of ints */
+ uint8_t le_value_intlen; /* size of value's ints */
uint16_t le_next; /* next entry in hash chain */
uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_name_numints; /* ints in name (incl null) */
uint16_t le_value_chunk; /* first chunk of the value */
- uint16_t le_value_length; /* value length in ints */
+ uint16_t le_value_numints; /* value length in ints */
uint32_t le_cd; /* collision differentiator */
uint64_t le_hash; /* hash value of the name */
} l_entry;
@@ -177,7 +178,7 @@ typedef struct zap_entry_handle {
* value must equal zap_hash(name).
*/
extern int zap_leaf_lookup(zap_leaf_t *l,
- zap_name_t *zn, zap_entry_handle_t *zeh);
+ struct zap_name *zn, zap_entry_handle_t *zeh);
/*
* Return a handle to the entry with this hash+cd, or the entry with the
@@ -193,10 +194,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l,
* num_integers in the attribute.
*/
extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
+ uint8_t integer_size, uint64_t num_integers, void *buf);
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
+extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
/*
* Replace the value of an existing entry.
@@ -204,7 +205,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
* zap_entry_update may fail if it runs out of space (ENOSPC).
*/
extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
/*
* Remove an entry.
@@ -216,17 +217,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh);
* belong in this leaf (according to its hash value). Fills in the
* entry handle on success. Returns 0 on success or ENOSPC on failure.
*/
-extern int zap_entry_create(zap_leaf_t *l,
- const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
+extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
/*
* Return true if there are additional entries with the same normalized
* form.
*/
extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
- zap_name_t *zn, const char *name, zap_t *zap);
+ struct zap_name *zn, const char *name, struct zap *zap);
/*
* Other stuff.
@@ -235,7 +235,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
-extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
+ struct zap_stats *zs);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
index ea1509504305..d3c471a60903 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ACL_H
@@ -32,6 +31,7 @@
#include <sys/acl.h>
#include <sys/dmu.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
@@ -105,12 +105,18 @@ typedef struct zfs_acl_phys_v0 {
#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary for dealing with both the V0 and V1 ACL layouts.
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
typedef struct zfs_acl_phys {
uint64_t z_acl_extern_obj; /* ext acl pieces */
uint32_t z_acl_size; /* Number of bytes in ACL */
uint16_t z_acl_version; /* acl version */
uint16_t z_acl_count; /* ace count */
- uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
} zfs_acl_phys_t;
typedef struct acl_ops {
@@ -145,21 +151,26 @@ typedef struct zfs_acl_node {
void *z_allocdata; /* pointer to kmem allocated memory */
size_t z_allocsize; /* Size of blob in bytes */
size_t z_size; /* length of ACL data */
- int z_ace_count; /* number of ACEs in this acl node */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
int z_ace_idx; /* ace iterator positioned on */
} zfs_acl_node_t;
typedef struct zfs_acl {
- int z_acl_count; /* Number of ACEs */
+ uint64_t z_acl_count; /* Number of ACEs */
size_t z_acl_bytes; /* Number of bytes in ACL */
uint_t z_version; /* version of ACL */
void *z_next_ace; /* pointer to next ACE */
- int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
list_t z_acl; /* chunks of ACE data */
acl_ops_t z_ops; /* ACL operations */
} zfs_acl_t;
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
#define ACL_DATA_ALLOCED 0x1
#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
@@ -206,7 +217,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
@@ -214,11 +225,20 @@ void zfs_acl_free(zfs_acl_t *);
int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
struct zfs_fuid_info **, zfs_acl_t **);
int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+ uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
#endif
#ifdef __cplusplus
}
#endif
-
-#endif /* !ZFS_NO_ACL */
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
index 952bb24a4567..6dc163d6f530 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZFS_CONTEXT_H
#define _SYS_ZFS_CONTEXT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -54,6 +52,8 @@ extern "C" {
#include <sys/byteorder.h>
#include <sys/systm.h>
#include <sys/list.h>
+#include <sys/zfs_debug.h>
+#include <sys/sysevent.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/time.h>
@@ -83,10 +83,11 @@ extern "C" {
#include <sys/misc.h>
#include <sys/sig.h>
#include <sys/osd.h>
-#include <sys/zfs_debug.h>
+#include <sys/sysevent/dev.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/u8_textprep.h>
#include <sys/fm/util.h>
+#include <sys/sunddi.h>
#include <machine/stdarg.h>
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
index 450ac1c81b42..50ecf9b36249 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
#define _SYS_ZFS_DEBUG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func,
extern void zfs_panic_recover(const char *fmt, ...);
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
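/*
 * Illustrative sketch (not part of this patch): the printf-style interface
 * above appends a timestamped zfs_dbgmsg_t to an in-kernel list, e.g.
 *
 *	zfs_dbgmsg("spa %s: txg %llu synced", spa_name(spa),
 *	    (u_longlong_t)txg);
 */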
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
index bd2c938515ff..349f8ef37321 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FS_ZFS_DIR_H
#define _SYS_FS_ZFS_DIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/pathname.h>
#include <sys/dmu.h>
#include <sys/zfs_znode.h>
@@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
pathname_t *);
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
- uint_t, znode_t **, int, zfs_acl_ids_t *);
+ uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
index c035707c62a6..b381bb98e734 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/zfs_vfsops.h>
#endif
#include <sys/avl.h>
+#include <sys/list.h>
#ifdef __cplusplus
extern "C" {
@@ -100,6 +101,8 @@ typedef struct zfs_fuid_info {
#ifdef _KERNEL
struct znode;
extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+ uint64_t, uint64_t, zfs_fuid_type_t);
extern void zfs_fuid_destroy(zfsvfs_t *);
extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
cred_t *, zfs_fuid_info_t **);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index bf107d605fac..63b9c57eb8a9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -19,19 +19,18 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
#define _SYS_ZFS_IOCTL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cred.h>
#include <sys/dmu.h>
#include <sys/zio.h>
#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zfs_stat.h>
#ifdef _KERNEL
#include <sys/nvpair.h>
@@ -47,26 +46,86 @@ extern "C" {
#define ZFS_SNAPDIR_HIDDEN 0
#define ZFS_SNAPDIR_VISIBLE 1
-#define DMU_BACKUP_STREAM_VERSION (1ULL)
-#define DMU_BACKUP_HEADER_VERSION (2ULL)
+/*
+ * Field manipulation macros for the drr_versioninfo field of the
+ * send stream header.
+ */
+
+/*
+ * Header types for zfs send streams.
+ */
+typedef enum drr_headertype {
+ DMU_SUBSTREAM = 0x1,
+ DMU_COMPOUNDSTREAM = 0x2
+} drr_headertype_t;
+
+#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2)
+#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x)
+
+#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30)
+#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x)
+
+/*
+ * Feature flags for zfs send streams (flags in drr_versioninfo)
+ */
+
+#define DMU_BACKUP_FEATURE_DEDUP (0x1)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
+#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
+
+/*
+ * Mask of all supported backup features
+ */
+#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+
+/* Are all features in the given flag word currently supported? */
+#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+
+/*
+ * The drr_versioninfo field of the dmu_replay_record has the
+ * following layout:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | reserved | feature-flags |C|S|
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * The low order two bits indicate the header type: SUBSTREAM (0x1)
+ * or COMPOUNDSTREAM (0x2). Using two bits for this is historical:
+ * this field used to be a version number, where the two version types
+ * were 1 and 2. Using two bits for this allows earlier versions of
+ * the code to recognize send streams that don't use any
+ * of the features indicated by feature flags.
+ */
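/*
 * Illustrative sketch (not part of this patch): composing and decoding a
 * drr_versioninfo word with the macros above.
 */
static uint64_t
drr_versioninfo_sketch(void)
{
	uint64_t vi = 0;

	DMU_SET_STREAM_HDRTYPE(vi, DMU_COMPOUNDSTREAM);
	DMU_SET_FEATUREFLAGS(vi, DMU_BACKUP_FEATURE_SA_SPILL);

	ASSERT(DMU_GET_STREAM_HDRTYPE(vi) == DMU_COMPOUNDSTREAM);
	ASSERT(DMU_STREAM_SUPPORTED(DMU_GET_FEATUREFLAGS(vi)));
	return (vi);
}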
+
#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
/*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define DRR_CHECKSUM_DEDUP (1<<0)
+
+#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+
+/*
* zfs ioctl command structure
*/
typedef struct dmu_replay_record {
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
- DRR_WRITE, DRR_FREE, DRR_END,
+ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+ DRR_SPILL, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
struct drr_begin {
uint64_t drr_magic;
- uint64_t drr_version;
+ uint64_t drr_versioninfo; /* was drr_version */
uint64_t drr_creation_time;
dmu_objset_type_t drr_type;
uint32_t drr_flags;
@@ -76,6 +135,7 @@ typedef struct dmu_replay_record {
} drr_begin;
struct drr_end {
zio_cksum_t drr_checksum;
+ uint64_t drr_toguid;
} drr_end;
struct drr_object {
uint64_t drr_object;
@@ -83,14 +143,16 @@ typedef struct dmu_replay_record {
dmu_object_type_t drr_bonustype;
uint32_t drr_blksz;
uint32_t drr_bonuslen;
- uint8_t drr_checksum;
+ uint8_t drr_checksumtype;
uint8_t drr_compress;
uint8_t drr_pad[6];
+ uint64_t drr_toguid;
/* bonus content follows */
} drr_object;
struct drr_freeobjects {
uint64_t drr_firstobj;
uint64_t drr_numobjs;
+ uint64_t drr_toguid;
} drr_freeobjects;
struct drr_write {
uint64_t drr_object;
@@ -98,16 +160,61 @@ typedef struct dmu_replay_record {
uint32_t drr_pad;
uint64_t drr_offset;
uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
/* content follows */
} drr_write;
struct drr_free {
uint64_t drr_object;
uint64_t drr_offset;
uint64_t drr_length;
+ uint64_t drr_toguid;
} drr_free;
+ struct drr_write_byref {
+ /* where to put the data */
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ /* where to find the prior copy of the data */
+ uint64_t drr_refguid;
+ uint64_t drr_refobject;
+ uint64_t drr_refoffset;
+ /* properties of the data */
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ } drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
} drr_u;
} dmu_replay_record_t;
+/* diff record range types */
+typedef enum diff_type {
+ DDR_NONE = 0x1,
+ DDR_INUSE = 0x2,
+ DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+ uint64_t ddr_type;
+ uint64_t ddr_first;
+ uint64_t ddr_last;
+} dmu_diff_record_t;
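/*
 * Illustrative sketch (not part of this patch): a userland-style consumer
 * draining the fixed-size records above from the file descriptor handed to
 * the kernel by zfs diff.  Error handling is abbreviated.
 */
static void
dump_diff_records_sketch(int fd)
{
	dmu_diff_record_t ddr;

	while (read(fd, &ddr, sizeof (ddr)) == sizeof (ddr)) {
		if (ddr.ddr_type == DDR_INUSE)
			(void) printf("inuse\t%llu\t%llu\n",
			    (u_longlong_t)ddr.ddr_first,
			    (u_longlong_t)ddr.ddr_last);
	}
}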
+
typedef struct zinject_record {
uint64_t zi_objset;
uint64_t zi_object;
@@ -119,6 +226,10 @@ typedef struct zinject_record {
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
} zinject_record_t;
#define ZINJECT_NULL 0x1
@@ -146,8 +257,9 @@ typedef enum zfs_case {
typedef struct zfs_cmd {
char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
char zc_string[MAXNAMELEN];
+ char zc_top_ds[MAXPATHLEN];
uint64_t zc_guid;
uint64_t zc_nvlist_conf; /* really (char *) */
uint64_t zc_nvlist_conf_size;
@@ -162,11 +274,21 @@ typedef struct zfs_cmd {
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share;
uint64_t zc_jailid;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_pad[4]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
} zfs_cmd_t;
typedef struct zfs_useracct {
@@ -176,8 +298,10 @@ typedef struct zfs_useracct {
uint64_t zu_space;
} zfs_useracct_t;
-#define ZVOL_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
+#define ZFSDEV_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
+
+#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
#ifdef _KERNEL
@@ -191,7 +315,29 @@ extern int zfs_secpolicy_rename_perms(const char *from,
const char *to, cred_t *cr);
extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
extern int zfs_busy(void);
-extern int zfs_unmount_snap(char *, void *);
+extern int zfs_unmount_snap(const char *, void *);
+
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+ ZSST_ZVOL,
+ ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+ enum zfs_soft_state_type zss_type;
+ void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+ enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
+extern kmutex_t zfsdev_state_lock;
#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
new file mode 100644
index 000000000000..4982bd4d0afc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
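
The zo_actions list above carries cleanup callbacks that are registered with zfs_onexit_add_cb() and fired when the control-device file descriptor is finally closed. Below is a minimal userland sketch of that register-then-fire pattern; the plain singly linked list, the struct names and the lack of locking are simplifications for the example, not the kernel code. In the kernel the same idea is used, for example, to release temporary snapshot holds when the holding process goes away.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for zfs_onexit_action_node_t: one cleanup action. */
typedef struct action {
	struct action *next;
	void (*func)(void *);
	void *data;
} action_t;

/* Simplified stand-in for zfs_onexit_t: the per-open list of actions. */
typedef struct onexit {
	action_t *head;
} onexit_t;

static void
onexit_add_cb(onexit_t *zo, void (*func)(void *), void *data)
{
	action_t *a = malloc(sizeof (*a));

	a->func = func;
	a->data = data;
	a->next = zo->head;
	zo->head = a;
}

/* Run (and free) every registered action, mimicking last-close cleanup. */
static void
onexit_destroy(onexit_t *zo)
{
	while (zo->head != NULL) {
		action_t *a = zo->head;
		zo->head = a->next;
		a->func(a->data);
		free(a);
	}
}

static void
release_hold(void *arg)
{
	printf("releasing temporary hold on %s\n", (const char *)arg);
}

int
main(void)
{
	onexit_t zo = { NULL };

	onexit_add_cb(&zo, release_hold, "tank/fs@snap");
	onexit_destroy(&zo);	/* fires the callback as the fd goes away */
	return (0);
}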
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
new file mode 100644
index 000000000000..fc40b0e9517c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to the ZPL.
+ * The values of the actual attributes are not
+ * defined by the order of the enums. They are
+ * controlled by the attribute registration
+ * mechanism. Two different file systems could
+ * have different numeric values for the same
+ * attributes. This list is only used for dereferencing
+ * into the table that will hold the actual numeric value.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_zap; /* 144 - extra attributes */
+ uint64_t zp_pad[3]; /* 152 - future */
+ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we use this space for the following:
+ * - symbolic links
+ * - 32-byte anti-virus scanstamp (regular files only)
+ */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
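
The SA_*_OFFSET constants above describe a packed layout of 8-byte attributes (mode at byte 0, size at byte 8, and so on) used when the base attributes are written as one contiguous buffer. The following standalone sketch only demonstrates that fixed-offset packing; put_attr(), get_attr() and the 48-byte buffer are invented for the example and are not part of the SA API.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SA_MODE_OFFSET		0
#define SA_SIZE_OFFSET		8
#define SA_GEN_OFFSET		16
#define SA_UID_OFFSET		24
#define SA_GID_OFFSET		32
#define SA_PARENT_OFFSET	40

/* Store one 8-byte attribute at its fixed offset in the packed buffer. */
static void
put_attr(uint8_t *buf, size_t off, uint64_t val)
{
	memcpy(buf + off, &val, sizeof (val));
}

static uint64_t
get_attr(const uint8_t *buf, size_t off)
{
	uint64_t val;

	memcpy(&val, buf + off, sizeof (val));
	return (val);
}

int
main(void)
{
	uint8_t base[48] = { 0 };	/* six 8-byte attributes */

	put_attr(base, SA_MODE_OFFSET, 0100644);	/* regular file, 0644 */
	put_attr(base, SA_SIZE_OFFSET, 4096);
	put_attr(base, SA_GEN_OFFSET, 7);

	printf("mode=%llo size=%llu gen=%llu\n",
	    (unsigned long long)get_attr(base, SA_MODE_OFFSET),
	    (unsigned long long)get_attr(base, SA_SIZE_OFFSET),
	    (unsigned long long)get_attr(base, SA_GEN_OFFSET));
	return (0);
}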
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
new file mode 100644
index 000000000000..a8af7ec61ba9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_STAT_H
+#define _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl. zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+ uint64_t zs_gen;
+ uint64_t zs_mode;
+ uint64_t zs_links;
+ uint64_t zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
index 163a8000248b..c328a03b3ecf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_VFSOPS_H
@@ -29,6 +28,7 @@
#include <sys/list.h>
#include <sys/vfs.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#include <sys/rrwlock.h>
#include <sys/zfs_ioctl.h>
@@ -37,6 +37,7 @@ extern "C" {
#endif
typedef struct zfsvfs zfsvfs_t;
+struct znode;
struct zfsvfs {
vfs_t *z_vfs; /* generic fs struct */
@@ -54,7 +55,6 @@ struct zfsvfs {
boolean_t z_fuid_dirty; /* need to sync fuid table ? */
struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
zilog_t *z_log; /* intent log pointer */
- uint_t z_acl_mode; /* acl chmod/mode behavior */
uint_t z_acl_inherit; /* acl inheritance behavior */
zfs_case_t z_case; /* case-sense */
boolean_t z_utf8; /* utf8-only */
@@ -71,12 +71,14 @@ struct zfsvfs {
boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
- kmutex_t z_online_recv_lock; /* held while recv in progress */
+ boolean_t z_use_sa; /* version allows system attributes */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
+ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
};
@@ -132,19 +134,23 @@ typedef struct zfid_long {
extern uint_t zfs_fsyncer_key;
extern int zfs_super_owner;
-extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valuep);
extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
- boolean_t isgroup, uint64_t fuid);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+ boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+ uint64_t fuid);
extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
-extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
+extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
+extern int zfs_vnode_lock(vnode_t *vp, int flags);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index 6f0a43636010..d3955d7eee7b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ZNODE_H
@@ -29,8 +28,11 @@
#ifdef _KERNEL
#include <sys/list.h>
#include <sys/dmu.h>
+#include <sys/sa.h>
#include <sys/zfs_vfsops.h>
#include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -54,13 +56,18 @@ extern "C" {
#define ZFS_OPAQUE 0x0000010000000000
#define ZFS_AV_QUARANTINED 0x0000020000000000
#define ZFS_AV_MODIFIED 0x0000040000000000
+#define ZFS_REPARSE 0x0000080000000000
+#define ZFS_OFFLINE 0x0000100000000000
+#define ZFS_SPARSE 0x0000200000000000
-#define ZFS_ATTR_SET(zp, attr, value) \
+#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
{ \
if (value) \
- zp->z_phys->zp_flags |= attr; \
+ pflags |= attr; \
else \
- zp->z_phys->zp_flags &= ~attr; \
+ pflags &= ~attr; \
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+ &pflags, sizeof (pflags), tx)); \
}
/*
@@ -76,25 +83,46 @@ extern "C" {
#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME]
+#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME]
+#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME]
+#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME]
+#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN]
+#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES]
+#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR]
+#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK]
+#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV]
+#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP]
+#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID]
+#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID]
+#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT]
+#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS]
+#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE]
+#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT]
+#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
+#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
+#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
+#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
+
/*
* Is ID ephemeral?
*/
-#ifdef TODO
#define IS_EPHEMERAL(x) (x > MAXUID)
-#else
-#define IS_EPHEMERAL(x) (0)
-#endif
/*
* Should we use FUIDs?
*/
-#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\
+#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \
spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
#define MASTER_NODE_OBJ 1
/*
* Special attributes for master node.
+ * "userquota@" and "groupquota@" are also valid (from
+ * zfs_userquota_prop_prefixes[]).
*/
#define ZFS_FSID "FSID"
#define ZFS_UNLINKED_SET "DELETE_QUEUE"
@@ -102,6 +130,7 @@ extern "C" {
#define ZPL_VERSION_STR "VERSION"
#define ZFS_FUID_TABLES "FUID"
#define ZFS_SHARES_DIR "SHARES"
+#define ZFS_SA_ATTRS "SA_ATTRS"
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
@@ -132,42 +161,6 @@ extern "C" {
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
/*
- * This is the persistent portion of the znode. It is stored
- * in the "bonus buffer" of the file. Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_zap; /* 144 - extra attributes */
- uint64_t zp_pad[3]; /* 152 - future */
- zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we use this space for the following:
- * - symbolic links
- * - 32-byte anti-virus scanstamp (regular files only)
- */
-} znode_phys_t;
-
-/*
* Directory entry locks control access to directory entries.
* They are used to protect creates, deletes, and renames.
* Each directory znode has a mutex and a list of locked names.
@@ -196,20 +189,24 @@ typedef struct znode {
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_last_itx; /* last ZIL itx on this znode */
- uint64_t z_gen; /* generation (same as zp_gen) */
+ uint64_t z_gen; /* generation (cached) */
+ uint64_t z_size; /* file size (cached) */
+ uint64_t z_atime[2]; /* atime (cached) */
+ uint64_t z_links; /* file links (cached) */
+ uint64_t z_pflags; /* pflags (cached) */
+ uint64_t z_uid; /* uid fuid (cached) */
+ uint64_t z_gid; /* gid fuid (cached) */
+ mode_t z_mode; /* mode (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
list_node_t z_link_node; /* all znodes in fs link */
- /*
- * These are dmu managed fields.
- */
- znode_phys_t *z_phys; /* pointer to persistent znode */
- dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+ sa_handle_t *z_sa_hdl; /* handle to sa data */
+ boolean_t z_is_sa; /* are we native sa? */
/* FreeBSD-specific field. */
struct task z_task;
} znode_t;
@@ -277,7 +274,7 @@ VTOZ(vnode_t *vp)
#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
#define ZFS_VERIFY_ZP(zp) \
- if ((zp)->z_dbuf == NULL) { \
+ if ((zp)->z_sa_hdl == NULL) { \
ZFS_EXIT((zp)->z_zfsvfs); \
return (EIO); \
} \
@@ -319,14 +316,14 @@ VTOZ(vnode_t *vp)
#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_time_stamper(zp, ACCESSED, NULL)
+ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
extern int zfs_init_fs(zfsvfs_t *, znode_t **);
extern void zfs_set_dataprop(objset_t *);
extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
dmu_tx_t *tx);
-extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+ uint64_t [2], boolean_t);
extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
@@ -349,7 +346,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
vattr_t *vap);
extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name);
+ znode_t *dzp, char *name, uint64_t foid);
+#define ZFS_NO_OBJECT 0 /* no object id */
extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name);
extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -366,7 +364,7 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
#endif
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
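
With the SA conversion above, the persistent znode fields live behind z_sa_hdl and are also cached in the in-core znode (z_size, z_pflags, and so on), so changing a flag now means updating the cached value and pushing it through sa_update(), which is what the reworked ZFS_ATTR_SET() does. Below is a standalone sketch of that read-modify-write-then-persist pattern; persist_flags() is only a stand-in for the real sa_update(SA_ZPL_FLAGS()) call, and znode_sketch_t is a local type invented for the example.

#include <stdio.h>
#include <stdint.h>

#define ZFS_SPARSE	0x0000200000000000ULL	/* persistent flag, as above */

typedef struct znode_sketch {
	uint64_t z_pflags;	/* cached copy of the persistent flags */
} znode_sketch_t;

/* Stand-in for sa_update(SA_ZPL_FLAGS(...)): persist the cached value. */
static void
persist_flags(const znode_sketch_t *zp)
{
	printf("sa_update(FLAGS) -> 0x%016llx\n",
	    (unsigned long long)zp->z_pflags);
}

/* Mirrors ZFS_ATTR_SET(): update the cache, then write it back. */
static void
attr_set(znode_sketch_t *zp, uint64_t attr, int value)
{
	if (value)
		zp->z_pflags |= attr;
	else
		zp->z_pflags &= ~attr;
	persist_flags(zp);
}

int
main(void)
{
	znode_sketch_t zn = { 0 };

	attr_set(&zn, ZFS_SPARSE, 1);	/* set the flag */
	attr_set(&zn, ZFS_SPARSE, 0);	/* clear it again */
	return (0);
}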
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
index efbf65e287ee..a4c5575b2dba 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_ZIL_H
#define _SYS_ZIL_H
@@ -55,34 +56,40 @@ typedef struct zil_header {
uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
uint64_t zh_flags; /* header flags */
- uint64_t zh_pad[4];
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
} zil_header_t;
/*
* zh_flags bit settings
*/
-#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
/*
- * Log block trailer - structure at the end of the header and each log block
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block, which allows writing only the portion of the
+ * block that is actually used. The older position is still supported for
+ * backwards compatibility.
*
- * The zit_bt contains a zbt_cksum which for the intent log is
+ * The zio_eck_t contains a zec_cksum which for the intent log is
* the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
+ * The zec_cksum is checked by the SPA against the sequence
* number passed in the blk_cksum field of the blkptr_t
*/
-typedef struct zil_trailer {
- uint64_t zit_pad;
- blkptr_t zit_next_blk; /* next block in chain */
- uint64_t zit_nused; /* bytes in log block used */
- zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
-#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
/*
* The words of a log block checksum.
@@ -150,16 +157,26 @@ typedef enum zil_create {
#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
* Each log record has a common structure at the beginning.
*
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number. The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record. The two sequence numbers are
- * different because the transactions can now be pushed out of order.
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
*/
typedef struct { /* common log record header */
uint64_t lrc_txtype; /* intent log transaction type */
@@ -169,6 +186,14 @@ typedef struct { /* common log record header */
} lr_t;
/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
+
+/*
* Handle option extended vattr attributes.
*
* Whenever new attributes are added the version number
@@ -258,7 +283,7 @@ typedef struct {
uint64_t lr_foid; /* file object to write */
uint64_t lr_offset; /* offset to write to */
uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ uint64_t lr_blkoff; /* no longer used */
blkptr_t lr_blkptr; /* spa block pointer for replay */
/* write data will follow for small writes */
} lr_write_t;
@@ -306,13 +331,34 @@ typedef struct {
*/
/*
- * ZFS intent log transaction structure
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
*/
typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
/* and put blkptr in log, rather than actual data) */
WR_COPIED, /* immediate - data is copied into lr_write_t */
WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
} itx_wr_state_t;
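
To make the three write-logging strategies above concrete, here is a simplified, standalone decision sketch. The real policy lives in zfs_log_write()/zvol_log_write() and also considers logbias; choose_write_state(), its argument list and the 32 KB threshold are illustrative stand-ins for zfs_immediate_write_sz, not the actual code.

#include <stdio.h>
#include <stdint.h>

typedef enum {
	WR_INDIRECT_SKETCH,	/* dmu_sync() the block, log only the pointer */
	WR_COPIED_SKETCH,	/* copy the data into the log record now */
	WR_NEED_COPY_SKETCH	/* copy later, only if the itx must be flushed */
} wr_state_sketch_t;

static wr_state_sketch_t
choose_write_state(uint64_t len, uint64_t blocksize, int sync, int has_slog)
{
	const uint64_t immediate_write_sz = 32768;	/* illustrative only */

	if (!has_slog && len >= immediate_write_sz && len == blocksize)
		return (WR_INDIRECT_SKETCH); /* big single-block write, no slog */
	if (sync)
		return (WR_COPIED_SKETCH);   /* committing right away anyway */
	return (WR_NEED_COPY_SKETCH);        /* defer; may never need the data */
}

int
main(void)
{
	printf("%d\n", choose_write_state(131072, 131072, 0, 0)); /* indirect */
	printf("%d\n", choose_write_state(4096, 131072, 1, 1));   /* copied */
	printf("%d\n", choose_write_state(4096, 131072, 0, 1));   /* need copy */
	return (0);
}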
typedef struct itx {
@@ -321,30 +367,19 @@ typedef struct itx {
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
uint64_t itx_sod; /* record size on disk */
+ uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
- zilog_t *zgd_zilog; /* zilog */
- blkptr_t *zgd_bp; /* block pointer */
- struct rl *zgd_rl; /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t();
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
-extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
extern void zil_init(void);
@@ -358,28 +393,33 @@ extern void zil_close(zilog_t *zilog);
extern void zil_replay(objset_t *os, void *arg,
zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void zil_itx_destroy(itx_t *itx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
-extern int zil_vdev_offline(char *osname, void *txarg);
-extern int zil_claim(char *osname, void *txarg);
-extern int zil_check_log_chain(char *osname, void *txarg);
+extern int zil_vdev_offline(const char *osname, void *txarg);
+extern int zil_claim(const char *osname, void *txarg);
+extern int zil_check_log_chain(const char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog);
-extern int zil_is_committed(zilog_t *zilog);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
-extern void zil_add_block(zilog_t *zilog, blkptr_t *bp);
-extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
+extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
-extern int zil_disable;
+extern int zil_replay_disable;
#ifdef __cplusplus
}
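
Because every TX_OOO() record keeps lr_foid immediately after the common header, replay code can read the object id through the lr_ooo_t overlay without knowing the concrete record type. Below is a standalone sketch of that overlay; the trimmed structures and the TX_WRITE value are local stand-ins for the real definitions.

#include <stdio.h>
#include <stdint.h>

/* Trimmed-down copies of the record layouts, for illustration only. */
typedef struct {
	uint64_t lrc_txtype;
	uint64_t lrc_reclen;
	uint64_t lrc_txg;
	uint64_t lrc_seq;
} lr_sketch_t;

typedef struct {
	lr_sketch_t lr_common;
	uint64_t lr_foid;	/* same offset in every TX_OOO() record */
} lr_ooo_sketch_t;

typedef struct {
	lr_sketch_t lr_common;
	uint64_t lr_foid;
	uint64_t lr_offset;
	uint64_t lr_length;
} lr_write_sketch_t;

#define TX_WRITE_SKETCH	9	/* stands in for the real TX_WRITE constant */

int
main(void)
{
	lr_write_sketch_t wr =
	    { { TX_WRITE_SKETCH, sizeof (wr), 100, 1 }, 42, 0, 512 };
	const lr_sketch_t *lr = &wr.lr_common;

	/* Generic code can fish out the object id without knowing the type. */
	uint64_t foid = ((const lr_ooo_sketch_t *)lr)->lr_foid;
	printf("record type %llu touches object %llu\n",
	    (unsigned long long)lr->lrc_txtype, (unsigned long long)foid);
	return (0);
}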
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
index 3f2582931d15..1d4c0cc6c1de 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_ZIL_IMPL_H
#define _SYS_ZIL_IMPL_H
@@ -43,12 +44,34 @@ typedef struct lwb {
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
zio_t *lwb_zio; /* zio for this buffer */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
uint64_t lwb_max_txg; /* highest txg in this lwb */
- txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
} lwb_t;
/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ uint64_t itxg_sod; /* total size on disk for this txg */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
* we've touched so we know which ones need a write cache flush at the end.
*/
@@ -57,6 +80,8 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
+#define ZIL_PREV_BLKS 16
+
/*
* Stable storage intent log management structure. One per dataset.
*/
@@ -68,9 +93,8 @@ struct zilog {
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next itx sequence number */
- uint64_t zl_commit_seq; /* committed upto this number */
- uint64_t zl_lr_seq; /* log record sequence number */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
uint64_t zl_replaying_seq; /* current replay seq number */
@@ -82,24 +106,39 @@ struct zilog {
uint8_t zl_replay; /* replaying records while set */
uint8_t zl_stop_sync; /* for debugging */
uint8_t zl_writer; /* boolean: write setup in progress */
- uint8_t zl_log_error; /* boolean: log write error */
- list_t zl_itx_list; /* in-memory itx list */
+ uint8_t zl_logbias; /* latency or throughput */
+ uint8_t zl_sync; /* synchronous or asynchronous */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
+ uint64_t zl_next_batch; /* next batch number */
+ uint64_t zl_com_batch; /* committed batch number */
+ kcondvar_t zl_cv_batch[2]; /* batch condition variables */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
- uint64_t zl_prev_used; /* previous commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
- avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
};
-typedef struct zil_dva_node {
+typedef struct zil_bp_node {
dva_t zn_dva;
avl_node_t zn_node;
-} zil_dva_node_t;
+} zil_bp_node_t;
+
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+ sizeof (lr_write_t))
#ifdef __cplusplus
}
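
The new zl_itxg[TXG_SIZE] array above keeps one itx chain per open transaction group, indexed by txg & TXG_MASK, so a slot is recycled once its txg has been synced. Here is a toy sketch of that bucketing; it only counts records, whereas the real itxg_t carries the sync list, the per-foid async AVL tree and itxg_lock.

#include <stdio.h>
#include <stdint.h>

#define TXG_SIZE	4	/* as in the DMU: a small ring of open txgs */
#define TXG_MASK	(TXG_SIZE - 1)

/* Very small stand-in for itxg_t: just count records per open txg. */
typedef struct {
	uint64_t itxg_txg;
	int itxg_count;
} itxg_sketch_t;

static itxg_sketch_t itxg[TXG_SIZE];

/* Mirrors how an itx would be bucketed by its transaction group. */
static void
assign_itx(uint64_t txg)
{
	itxg_sketch_t *g = &itxg[txg & TXG_MASK];

	if (g->itxg_txg != txg) {	/* slot recycled for a new txg */
		g->itxg_txg = txg;
		g->itxg_count = 0;
	}
	g->itxg_count++;
}

int
main(void)
{
	assign_itx(100);
	assign_itx(100);
	assign_itx(101);
	assign_itx(104);	/* reuses the slot txg 100 occupied */

	for (int i = 0; i < TXG_SIZE; i++)
		printf("slot %d: txg %llu, %d itxs\n", i,
		    (unsigned long long)itxg[i].itxg_txg, itxg[i].itxg_count);
	return (0);
}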
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 049c12202282..355f560f0fc7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZIO_H
@@ -38,12 +37,15 @@
extern "C" {
#endif
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
/*
* Gang block headers are self-checksumming and contain an array
@@ -51,16 +53,16 @@ typedef struct zio_block_tail {
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
+ sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
+ zio_eck_t zg_tail;
} zio_gbh_phys_t;
enum zio_checksum {
@@ -73,12 +75,19 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_FUNCTIONS
};
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+#define ZIO_CHECKSUM_MASK 0xffULL
+#define ZIO_CHECKSUM_VERIFY (1 << 8)
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
+#define ZIO_DEDUPDITTO_MIN 100
+
enum zio_compress {
ZIO_COMPRESS_INHERIT = 0,
ZIO_COMPRESS_ON,
@@ -94,12 +103,19 @@ enum zio_compress {
ZIO_COMPRESS_GZIP_7,
ZIO_COMPRESS_GZIP_8,
ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
ZIO_COMPRESS_FUNCTIONS
};
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+#define BOOTFS_COMPRESS_VALID(compress) \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ ((compress) == ZIO_COMPRESS_ON && \
+ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
+ (compress) == ZIO_COMPRESS_OFF)
+
#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
@@ -115,73 +131,81 @@ enum zio_compress {
#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
-#define ZIO_PRIORITY_TABLE_SIZE 11
-
-#define ZIO_FLAG_MUSTSUCCEED 0x000000
-#define ZIO_FLAG_CANFAIL 0x000001
-#define ZIO_FLAG_SPECULATIVE 0x000002
-#define ZIO_FLAG_CONFIG_WRITER 0x000004
-#define ZIO_FLAG_DONT_RETRY 0x000008
-
-#define ZIO_FLAG_DONT_CACHE 0x000010
-#define ZIO_FLAG_DONT_QUEUE 0x000020
-#define ZIO_FLAG_DONT_AGGREGATE 0x000040
-#define ZIO_FLAG_DONT_PROPAGATE 0x000080
-
-#define ZIO_FLAG_IO_BYPASS 0x000100
-#define ZIO_FLAG_IO_REPAIR 0x000200
-#define ZIO_FLAG_IO_RETRY 0x000400
-#define ZIO_FLAG_IO_REWRITE 0x000800
-
-#define ZIO_FLAG_SELF_HEAL 0x001000
-#define ZIO_FLAG_RESILVER 0x002000
-#define ZIO_FLAG_SCRUB 0x004000
-#define ZIO_FLAG_SCRUB_THREAD 0x008000
-
-#define ZIO_FLAG_PROBE 0x010000
-#define ZIO_FLAG_GANG_CHILD 0x020000
-#define ZIO_FLAG_RAW 0x040000
-#define ZIO_FLAG_GODFATHER 0x080000
-
-#define ZIO_FLAG_TRYHARD 0x100000
-
-#define ZIO_FLAG_GANG_INHERIT \
- (ZIO_FLAG_CANFAIL | \
- ZIO_FLAG_SPECULATIVE | \
- ZIO_FLAG_CONFIG_WRITER | \
- ZIO_FLAG_DONT_RETRY | \
- ZIO_FLAG_DONT_CACHE | \
- ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
-#define ZIO_FLAG_VDEV_INHERIT \
- (ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_IO_RETRY | \
- ZIO_FLAG_PROBE | \
- ZIO_FLAG_TRYHARD)
-
-#define ZIO_FLAG_AGG_INHERIT \
- (ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
+#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
+#define ZIO_PRIORITY_TABLE_SIZE 12
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+enum zio_flag {
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 7,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 8,
+ ZIO_FLAG_DONT_RETRY = 1 << 9,
+ ZIO_FLAG_DONT_CACHE = 1 << 10,
+ ZIO_FLAG_NODATA = 1 << 11,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 14,
+ ZIO_FLAG_TRYHARD = 1 << 15,
+ ZIO_FLAG_OPTIONAL = 1 << 16,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
+ ZIO_FLAG_IO_BYPASS = 1 << 19,
+ ZIO_FLAG_IO_REWRITE = 1 << 20,
+ ZIO_FLAG_RAW = 1 << 21,
+ ZIO_FLAG_GANG_CHILD = 1 << 22,
+ ZIO_FLAG_DDT_CHILD = 1 << 23,
+ ZIO_FLAG_GODFATHER = 1 << 24
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
#define ZIO_GANG_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_CANFAIL)
+
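
The *_INHERIT masks above rely on the flags being declared in group order: subtracting 1 from the first flag of the next group sets every lower bit, so (ZIO_FLAG_CANFAIL - 1) is exactly the set of flags that must match for two zios to aggregate. A standalone sketch of the trick, with locally defined stand-in bits rather than the real enum:

#include <stdio.h>
#include <stdint.h>

/* A subset of the flag bits, in the same order as the enum above. */
#define F_DONT_AGGREGATE	(1u << 0)
#define F_IO_REPAIR		(1u << 1)
#define F_SELF_HEAL		(1u << 2)
#define F_RESILVER		(1u << 3)
#define F_SCRUB			(1u << 4)
#define F_SCAN_THREAD		(1u << 5)
#define F_CANFAIL		(1u << 6)	/* first flag of the next group */

/*
 * The "(first flag of the next group) - 1" trick: subtracting 1 from a
 * power of two sets every lower bit, so this mask covers exactly the
 * flags declared before F_CANFAIL.
 */
#define F_AGG_INHERIT		(F_CANFAIL - 1)

int
main(void)
{
	uint32_t parent = F_RESILVER | F_SCRUB | F_CANFAIL;
	uint32_t child = parent & F_AGG_INHERIT;	/* what a child inherits */

	printf("parent 0x%02x -> aggregation-relevant bits 0x%02x\n",
	    parent, child);
	return (0);
}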
enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
ZIO_CHILD_LOGICAL,
ZIO_CHILD_TYPES
};
@@ -193,13 +217,13 @@ enum zio_wait_type {
};
/*
- * We'll take the EILSEQ and ENOMSG to indicate checksum errors and
- * fragmentation.
+ * We'll take the numbers 122 and 123 to indicate checksum errors and
+ * fragmentation. Those don't collide with any errno values as they
+ * are greater than ELAST.
*/
-#define ECKSUM EILSEQ
-#define EFRAGS ENOMSG
+#define ECKSUM 122
+#define EFRAGS 123
-typedef struct zio zio_t;
typedef void zio_done_func_t(zio_t *zio);
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
@@ -208,18 +232,15 @@ extern char *zio_type_name[ZIO_TYPES];
/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
* identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number. In summary:
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
*
- * mos is objset 0
- * meta-dnode is object 0
- * root block is <objset, 0, -1, 0>
- * intent log is <objset, 0, -1, ZIL sequence number>
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
*
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse. The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
*
* Note: this structure is passed between userland and the kernel.
* Therefore it must not change size or alignment between 32/64 bit
@@ -232,14 +253,66 @@ typedef struct zbookmark {
uint64_t zb_blkid;
} zbookmark_t;
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define ZB_DESTROYED_OBJSET (-1ULL)
+
+#define ZB_ROOT_OBJECT (0ULL)
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL)
+#define ZB_ZIL_LEVEL (-2LL)
+
typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
dmu_object_type_t zp_type;
uint8_t zp_level;
- uint8_t zp_ndvas;
+ uint8_t zp_copies;
+ uint8_t zp_dedup;
+ uint8_t zp_dedup_verify;
} zio_prop_t;
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+ const void *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum; /* defined in zio_checksum.h */
+
+struct zio_cksum_report {
+ struct zio_cksum_report *zcr_next;
+ nvlist_t *zcr_ereport;
+ nvlist_t *zcr_detector;
+ void *zcr_cbdata;
+ size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
+ uint64_t zcr_length;
+ zio_cksum_finish_f *zcr_finish;
+ zio_cksum_free_f *zcr_free;
+
+ /* internal use only */
+ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+ void *arg);
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+ zio_done_func_t *vsd_free;
+ zio_vsd_cksum_report_f *vsd_cksum_report;
+} zio_vsd_ops_t;
+
typedef struct zio_gang_node {
zio_gbh_phys_t *gn_gbh;
struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
@@ -290,6 +363,7 @@ struct zio {
uint64_t io_txg;
spa_t *io_spa;
blkptr_t *io_bp;
+ blkptr_t *io_bp_override;
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
@@ -301,16 +375,20 @@ struct zio {
zio_done_func_t *io_ready;
zio_done_func_t *io_done;
void *io_private;
+ int64_t io_prev_space_delta; /* DMU private */
blkptr_t io_bp_orig;
/* Data represented by this I/O */
void *io_data;
+ void *io_orig_data;
uint64_t io_size;
+ uint64_t io_orig_size;
/* Stuff for the vdev stack */
vdev_t *io_vd;
void *io_vsd;
- zio_done_func_t *io_vsd_free;
+ const zio_vsd_ops_t *io_vsd_ops;
+
uint64_t io_offset;
uint64_t io_deadline;
avl_node_t io_offset_node;
@@ -318,15 +396,17 @@ struct zio {
avl_tree_t *io_vdev_tree;
/* Internal pipeline state */
- int io_flags;
- zio_stage_t io_stage;
- uint32_t io_pipeline;
- int io_orig_flags;
- zio_stage_t io_orig_stage;
- uint32_t io_orig_pipeline;
+ enum zio_flag io_flags;
+ enum zio_stage io_stage;
+ enum zio_stage io_pipeline;
+ enum zio_flag io_orig_flags;
+ enum zio_stage io_orig_stage;
+ enum zio_stage io_orig_pipeline;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+ uint64_t io_child_count;
+ uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@@ -336,6 +416,7 @@ struct zio {
kcondvar_t io_cv;
/* FMA state */
+ zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
#ifdef _KERNEL
@@ -346,47 +427,53 @@ struct zio {
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb);
+ int priority, enum zio_flag flags, zbookmark_t *zb);
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t txg);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
@@ -407,11 +494,11 @@ extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
@@ -420,8 +507,12 @@ extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+ enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+ enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+ enum zio_compress parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
@@ -443,9 +534,30 @@ extern int zio_inject_fault(char *name, int flags, int *id,
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+ uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
#ifdef __cplusplus
}
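
The bookmark conventions above (root blocks at level -1, ZIL blocks at level -2) are normally filled in with the SET_BOOKMARK() macro. The following standalone sketch uses a trimmed copy of the structure together with the ZB_* constants from this header; zb_sketch_t is a local stand-in, and the real zbookmark_t may type or pack its fields slightly differently.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Trimmed copy of zbookmark_t, for illustration only. */
typedef struct {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t	 zb_level;
	uint64_t zb_blkid;
} zb_sketch_t;

#define SET_BOOKMARK(zb, objset, object, level, blkid)	\
{							\
	(zb)->zb_objset = objset;			\
	(zb)->zb_object = object;			\
	(zb)->zb_level = level;				\
	(zb)->zb_blkid = blkid;				\
}

#define ZB_ROOT_OBJECT	(0ULL)
#define ZB_ROOT_LEVEL	(-1LL)
#define ZB_ROOT_BLKID	(0ULL)
#define ZB_ZIL_OBJECT	(0ULL)
#define ZB_ZIL_LEVEL	(-2LL)

int
main(void)
{
	zb_sketch_t root, zil;

	/* Root block of objset 5: <5, 0, -1, 0>. */
	SET_BOOKMARK(&root, 5, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* Third block in objset 5's ZIL chain: <5, 0, -2, seq>. */
	SET_BOOKMARK(&zil, 5, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 3);

	printf("root: <%" PRIu64 ", %" PRIu64 ", %" PRId64 ", %" PRIu64 ">\n",
	    root.zb_objset, root.zb_object, root.zb_level, root.zb_blkid);
	printf("zil:  <%" PRIu64 ", %" PRIu64 ", %" PRId64 ", %" PRIu64 ">\n",
	    zil.zb_objset, zil.zb_object, zil.zb_level, zil.zb_blkid);
	return (0);
}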
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
index da407399da06..0956c04ab1b4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
@@ -43,28 +42,31 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
typedef struct zio_checksum_info {
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
+ int ci_eck; /* uses zio embedded checksum? */
+ int ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
+typedef struct zio_bad_cksum {
+ zio_cksum_t zbc_expected;
+ zio_cksum_t zbc_actual;
+ const char *zbc_checksum_name;
+ uint8_t zbc_byteswapped;
+ uint8_t zbc_injected;
+ uint8_t zbc_has_cksum; /* expected/actual valid */
+} zio_bad_cksum_t;
+
extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
extern zio_checksum_t zio_checksum_SHA256;
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
-extern int zio_checksum_error(zio_t *zio);
+extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
index 66ee8d45b3b6..30bed1a676e3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -20,15 +20,13 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zio.h>
#ifdef __cplusplus
@@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
/*
* Compress and decompress data if necessary.
*/
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
- void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize);
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
index e7503b733cc0..d90bd8bd5921 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,104 +34,136 @@ extern "C" {
#endif
/*
- * I/O Groups: pipeline stage definitions.
+ * zio pipeline stage definitions
*/
-typedef enum zio_stage {
- ZIO_STAGE_OPEN = 0, /* RWFCI */
+enum zio_stage {
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
- ZIO_STAGE_ISSUE_ASYNC, /* -W--- */
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
- ZIO_STAGE_READ_BP_INIT, /* R---- */
- ZIO_STAGE_WRITE_BP_INIT, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+ ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
- ZIO_STAGE_DVA_FREE, /* --F-- */
- ZIO_STAGE_DVA_CLAIM, /* ---C- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
- ZIO_STAGE_READY, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 15, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
- ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
- ZIO_STAGE_DONE, /* RWFCI */
- ZIO_STAGES
-} zio_stage_t;
+ ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
+};
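
Since each stage is now its own bit, a pipeline is just the OR of the stages a zio will visit, and advancing means shifting left until the next set bit is found. Below is a standalone sketch of that walk; the stage values match the enum above, but EXAMPLE_PIPELINE and next_stage() are simplifications of what the zio pipeline engine actually does.

#include <stdio.h>
#include <stdint.h>

/* A few of the stage bits, copied from the enum above. */
#define SKETCH_STAGE_OPEN		(1u << 0)
#define SKETCH_STAGE_CHECKSUM_GENERATE	(1u << 5)
#define SKETCH_STAGE_DVA_ALLOCATE	(1u << 12)
#define SKETCH_STAGE_READY		(1u << 15)
#define SKETCH_STAGE_DONE		(1u << 20)

/* An example write-style pipeline: just an OR of the stages it visits. */
#define EXAMPLE_PIPELINE \
	(SKETCH_STAGE_CHECKSUM_GENERATE | SKETCH_STAGE_DVA_ALLOCATE | \
	SKETCH_STAGE_READY | SKETCH_STAGE_DONE)

/* Advance from the current stage to the next stage the pipeline includes. */
static uint32_t
next_stage(uint32_t cur, uint32_t pipeline)
{
	uint32_t stage = cur << 1;

	while ((stage & pipeline) == 0)
		stage <<= 1;
	return (stage);
}

int
main(void)
{
	uint32_t stage = SKETCH_STAGE_OPEN;

	while (stage != SKETCH_STAGE_DONE) {
		stage = next_stage(stage, EXAMPLE_PIPELINE);
		printf("now at stage bit 0x%06x\n", stage);
	}
	return (0);
}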
-#define ZIO_INTERLOCK_STAGES \
- ((1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_INTERLOCK_STAGES \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
-#define ZIO_INTERLOCK_PIPELINE \
+#define ZIO_INTERLOCK_PIPELINE \
ZIO_INTERLOCK_STAGES
-#define ZIO_VDEV_IO_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE)
-#define ZIO_READ_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY))
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
-#define ZIO_READ_PHYS_PIPELINE \
+#define ZIO_READ_PHYS_PIPELINE \
ZIO_READ_COMMON_STAGES
-#define ZIO_READ_PIPELINE \
- (ZIO_READ_COMMON_STAGES | \
- (1U << ZIO_STAGE_READ_BP_INIT))
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
-#define ZIO_WRITE_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_ISSUE_ASYNC) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE))
-
-#define ZIO_WRITE_PHYS_PIPELINE \
- ZIO_WRITE_COMMON_STAGES
-
-#define ZIO_REWRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT))
-
-#define ZIO_WRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE))
-
-#define ZIO_GANG_STAGES \
- ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \
- (1U << ZIO_STAGE_GANG_ISSUE))
+#define ZIO_DDT_CHILD_READ_PIPELINE \
+ ZIO_READ_COMMON_STAGES
-#define ZIO_FREE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_FREE))
+#define ZIO_DDT_READ_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
-#define ZIO_CLAIM_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
-#define ZIO_IOCTL_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ZIO_WRITE_COMMON_STAGES
-#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE) | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_REWRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT)
+
+#define ZIO_WRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_CHILD_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_CHECKSUM_GENERATE | \
+ ZIO_STAGE_DDT_WRITE)
+
+#define ZIO_GANG_STAGES \
+ (ZIO_STAGE_GANG_ASSEMBLE | \
+ ZIO_STAGE_GANG_ISSUE)
+
+#define ZIO_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_DVA_FREE)
+
+#define ZIO_DDT_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_DDT_FREE)
+
+#define ZIO_CLAIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_DVA_CLAIM)
+
+#define ZIO_IOCTL_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_BLOCKING_STAGES \
+ (ZIO_STAGE_DVA_ALLOCATE | \
+ ZIO_STAGE_DVA_CLAIM | \
+ ZIO_STAGE_VDEV_IO_START)
extern void zio_inject_init(void);
extern void zio_inject_fini(void);
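
With the stages redefined as bit flags, a pipeline is simply the OR of its stages, so the (1U << stage) shifting that the old ordinal enum required disappears and a membership test becomes a plain mask check. A tiny standalone sketch of the idea, using local stand-in definitions rather than the kernel header:

#include <stdio.h>

/* Local stand-ins mirroring the bit-flag scheme above. */
enum stage {
	STAGE_OPEN		= 1 << 0,
	STAGE_CHECKSUM_VERIFY	= 1 << 19,
	STAGE_DONE		= 1 << 20
};

#define	READ_PIPELINE	(STAGE_OPEN | STAGE_CHECKSUM_VERIFY | STAGE_DONE)

int
main(void)
{
	unsigned int pipeline = READ_PIPELINE;

	/* With ordinal stages this would have required (1U << stage). */
	if (pipeline & STAGE_CHECKSUM_VERIFY)
		printf("pipeline verifies checksums\n");
	return (0);
}
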
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
new file mode 100644
index 000000000000..dcd63f7b5b91
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount;
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner;
+ const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define zrl_add(_z) zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
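
The new header only declares the interface. One reading of it (an assumption here, not something the header states) is that zrl_add()/zrl_remove() take and drop cheap references, while zrl_tryenter()/zrl_exit() acquire the lock exclusively only when the reference count has dropped to zero. A userland pthreads analogue of that pattern, purely illustrative and not the ZFS zrlock.c implementation:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

typedef struct myzrl {
	pthread_mutex_t	zr_mtx;
	pthread_cond_t	zr_cv;
	int		zr_refcount;	/* >= 0: references held; -1: locked */
} myzrl_t;

static void
myzrl_init(myzrl_t *z)
{
	(void) pthread_mutex_init(&z->zr_mtx, NULL);
	(void) pthread_cond_init(&z->zr_cv, NULL);
	z->zr_refcount = 0;
}

/* Assumed analogue of zrl_add(): take a reference, waiting out a locker. */
static void
myzrl_add(myzrl_t *z)
{
	(void) pthread_mutex_lock(&z->zr_mtx);
	while (z->zr_refcount < 0)
		(void) pthread_cond_wait(&z->zr_cv, &z->zr_mtx);
	z->zr_refcount++;
	(void) pthread_mutex_unlock(&z->zr_mtx);
}

/* Assumed analogue of zrl_remove(): drop a reference. */
static void
myzrl_remove(myzrl_t *z)
{
	(void) pthread_mutex_lock(&z->zr_mtx);
	assert(z->zr_refcount > 0);
	if (--z->zr_refcount == 0)
		(void) pthread_cond_broadcast(&z->zr_cv);
	(void) pthread_mutex_unlock(&z->zr_mtx);
}

/* Assumed analogue of zrl_tryenter(): lock only if no references exist. */
static int
myzrl_tryenter(myzrl_t *z)
{
	int ok = 0;

	(void) pthread_mutex_lock(&z->zr_mtx);
	if (z->zr_refcount == 0) {
		z->zr_refcount = -1;
		ok = 1;
	}
	(void) pthread_mutex_unlock(&z->zr_mtx);
	return (ok);
}

/* Assumed analogue of zrl_exit(): release the exclusive hold. */
static void
myzrl_exit(myzrl_t *z)
{
	(void) pthread_mutex_lock(&z->zr_mtx);
	assert(z->zr_refcount == -1);
	z->zr_refcount = 0;
	(void) pthread_cond_broadcast(&z->zr_cv);
	(void) pthread_mutex_unlock(&z->zr_mtx);
}

int
main(void)
{
	myzrl_t z;

	myzrl_init(&z);
	myzrl_add(&z);
	printf("tryenter with a reference held: %d\n", myzrl_tryenter(&z));
	myzrl_remove(&z);
	printf("tryenter with zero references:  %d\n", myzrl_tryenter(&z));
	myzrl_exit(&z);
	return (0);
}
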
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
index 2a6452aa433c..c0a0a69f71ca 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZVOL_H
#define _SYS_ZVOL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -43,26 +40,41 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
extern int zvol_check_volblocksize(uint64_t volblocksize);
extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, major_t);
+extern int zvol_create_minor(const char *);
extern int zvol_remove_minor(const char *);
+extern void zvol_remove_minors(const char *);
extern int zvol_set_volsize(const char *, major_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
+#ifdef sun
extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
-#ifndef __FreeBSD__
extern int zvol_strategy(buf_t *bp);
extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif
+#endif /* sun */
extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
int *rvalp);
extern int zvol_busy(void);
extern void zvol_init(void);
extern void zvol_fini(void);
+
+#ifdef sun
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+ ssize_t resid, boolean_t sync);
+#endif /* sun */
+
+#ifdef __FreeBSD__
+extern int zvol_create_minors(const char *name);
+#endif
+
#endif
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
index c69c117500f0..0885f27116d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -19,14 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
#include <sys/callb.h>
/*
@@ -36,24 +37,13 @@
static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);
-int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */
-extern int zfs_txg_synctime;
-extern uint64_t zfs_write_limit_override;
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0,
- "ZFS transaction groups (TXG)");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
TUNABLE_INT("vfs.zfs.txg.timeout", &zfs_txg_timeout);
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RDTUN, &zfs_txg_timeout, 0,
"Maximum seconds worth of delta per txg");
-TUNABLE_INT("vfs.zfs.txg.synctime", &zfs_txg_synctime);
-SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime, CTLFLAG_RDTUN, &zfs_txg_synctime,
- 0, "Target seconds to sync a txg");
-TUNABLE_QUAD("vfs.zfs.txg.write_limit_override", &zfs_write_limit_override);
-SYSCTL_UQUAD(_vfs_zfs_txg, OID_AUTO, write_limit_override, CTLFLAG_RW,
- &zfs_write_limit_override, 0,
- "Override maximum size of a txg to this size in bytes, "
- "value of 0 means don't override");
/*
* Prepare the txg subsystem.
@@ -74,10 +64,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
for (i = 0; i < TXG_SIZE; i++) {
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
NULL);
+ list_create(&tx->tx_cpu[c].tc_callbacks[i],
+ sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
}
}
- rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
@@ -100,7 +92,6 @@ txg_fini(dsl_pool_t *dp)
ASSERT(tx->tx_threads == 0);
- rw_destroy(&tx->tx_suspend);
mutex_destroy(&tx->tx_sync_lock);
cv_destroy(&tx->tx_sync_more_cv);
@@ -113,10 +104,15 @@ txg_fini(dsl_pool_t *dp)
int i;
mutex_destroy(&tx->tx_cpu[c].tc_lock);
- for (i = 0; i < TXG_SIZE; i++)
+ for (i = 0; i < TXG_SIZE; i++) {
cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+ }
}
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_destroy(tx->tx_commit_cb_taskq);
+
kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
bzero(tx, sizeof (tx_state_t));
@@ -196,7 +192,11 @@ txg_sync_stop(dsl_pool_t *dp)
* Finish off any work in progress.
*/
ASSERT(tx->tx_threads == 2);
- txg_wait_synced(dp, 0);
+
+ /*
+ * We need to ensure that we've vacated the deferred space_maps.
+ */
+ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
/*
* Wake all sync threads and wait for them to die.
@@ -246,6 +246,17 @@ txg_rele_to_quiesce(txg_handle_t *th)
}
void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+ mutex_exit(&tc->tc_lock);
+}
+
+void
txg_rele_to_sync(txg_handle_t *th)
{
tx_cpu_t *tc = th->th_cpu;
@@ -296,9 +307,61 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
}
static void
+txg_do_callbacks(void *arg)
+{
+ list_t *cb_list = arg;
+
+ dmu_tx_do_callbacks(cb_list, 0);
+
+ list_destroy(cb_list);
+
+ kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+ int c;
+ tx_state_t *tx = &dp->dp_tx;
+ list_t *cb_list;
+
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ /* No need to lock tx_cpu_t at this point */
+
+ int g = txg & TXG_MASK;
+
+ if (list_is_empty(&tc->tc_callbacks[g]))
+ continue;
+
+ if (tx->tx_commit_cb_taskq == NULL) {
+ /*
+ * Commit callback taskq hasn't been created yet.
+ */
+ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+ max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
+ TASKQ_PREPOPULATE);
+ }
+
+ cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(cb_list, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+
+ list_move_tail(&tc->tc_callbacks[g], cb_list);
+
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ txg_do_callbacks, cb_list, TQ_SLEEP);
+ }
+}
+
+static void
txg_sync_thread(void *arg)
{
dsl_pool_t *dp = arg;
+ spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
uint64_t start, delta;
@@ -311,20 +374,19 @@ txg_sync_thread(void *arg)
uint64_t txg;
/*
- * We sync when we're scrubbing, there's someone waiting
+ * We sync when we're scanning, there's someone waiting
* on us, or the quiesce thread has handed off a txg to
* us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
- while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
- spa_shutting_down(dp->dp_spa)) &&
+ while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
- delta = LBOLT - start;
+ delta = ddi_get_lbolt() - start;
timer = (delta > timeout ? 0 : timeout - delta);
}
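
The hunk above adds the commit-callback path: txg_register_callbacks() queues a transaction's callbacks on the per-CPU list for the open txg's slot, and txg_dispatch_callbacks() hands each slot's accumulated list to a taskq once that txg has synced. A standalone, single-threaded analogue of that flow (the 4-deep slot ring is assumed to match ZFS's TXG_SIZE, and the taskq is replaced by a direct call), just to show the slot indexing and the deferred firing:

#include <stdio.h>
#include <stdint.h>

#define	TXG_RING_SIZE	4		/* assumed to mirror TXG_SIZE */
#define	TXG_RING_MASK	(TXG_RING_SIZE - 1)
#define	MAX_CBS		8

typedef void cb_func_t(void *arg, int error);

struct slot {
	cb_func_t	*cb_func[MAX_CBS];
	void		*cb_arg[MAX_CBS];
	int		cb_count;
};

static struct slot slots[TXG_RING_SIZE];

/* Queue a callback against the slot for this txg (cf. txg_register_callbacks). */
static void
register_cb(uint64_t txg, cb_func_t *func, void *arg)
{
	struct slot *s = &slots[txg & TXG_RING_MASK];

	s->cb_func[s->cb_count] = func;
	s->cb_arg[s->cb_count++] = arg;
}

/* Fire everything queued for this txg (cf. txg_dispatch_callbacks). */
static void
sync_txg(uint64_t txg)
{
	struct slot *s = &slots[txg & TXG_RING_MASK];

	for (int i = 0; i < s->cb_count; i++)
		s->cb_func[i](s->cb_arg[i], 0);
	s->cb_count = 0;
}

static void
my_cb(void *arg, int error)
{
	printf("txg %llu committed, error %d\n",
	    *(unsigned long long *)arg, error);
}

int
main(void)
{
	unsigned long long txg = 42;

	register_cb(txg, my_cb, &txg);
	sync_txg(txg);		/* callbacks fire only after the sync */
	return (0);
}
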
@@ -342,8 +404,6 @@ txg_sync_thread(void *arg)
if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
- rw_enter(&tx->tx_suspend, RW_WRITER);
-
/*
* Consume the quiesced txg which has been handed off to
* us. This may cause the quiescing thread to now be
@@ -353,22 +413,24 @@ txg_sync_thread(void *arg)
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
- rw_exit(&tx->tx_suspend);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
- start = LBOLT;
- spa_sync(dp->dp_spa, txg);
- delta = LBOLT - start;
+ start = ddi_get_lbolt();
+ spa_sync(spa, txg);
+ delta = ddi_get_lbolt() - start;
mutex_enter(&tx->tx_sync_lock);
- rw_enter(&tx->tx_suspend, RW_WRITER);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
- rw_exit(&tx->tx_suspend);
cv_broadcast(&tx->tx_sync_done_cv);
+
+ /*
+ * Dispatch commit callbacks to worker threads.
+ */
+ txg_dispatch_callbacks(dp, txg);
}
}
@@ -426,7 +488,7 @@ void
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
{
tx_state_t *tx = &dp->dp_tx;
- int timeout = LBOLT + ticks;
+ int timeout = ddi_get_lbolt() + ticks;
/* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg ||
@@ -439,10 +501,10 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
return;
}
- while (LBOLT < timeout &&
+ while (ddi_get_lbolt() < timeout &&
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
- timeout - LBOLT);
+ timeout - ddi_get_lbolt());
mutex_exit(&tx->tx_sync_lock);
}
@@ -455,7 +517,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
mutex_enter(&tx->tx_sync_lock);
ASSERT(tx->tx_threads == 2);
if (txg == 0)
- txg = tx->tx_open_txg;
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
if (tx->tx_sync_txg_waiting < txg)
tx->tx_sync_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -506,21 +568,6 @@ txg_sync_waiting(dsl_pool_t *dp)
tx->tx_quiesced_txg != 0);
}
-void
-txg_suspend(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- /* XXX some code paths suspend when they are already suspended! */
- rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- rw_exit(&tx->tx_suspend);
-}
-
/*
* Per-txg object lists.
*/
@@ -578,6 +625,34 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
}
/*
+ * Add an entry to the end of the list (walks list to find end).
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ int already_on_list;
+
+ mutex_enter(&tl->tl_lock);
+ already_on_list = tn->tn_member[t];
+ if (!already_on_list) {
+ txg_node_t **tp;
+
+ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+ continue;
+
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = NULL;
+ *tp = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (already_on_list);
+}
+
+/*
* Remove the head of the list and return it.
*/
void *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
index 34d7e0c3ac74..692cda137f1a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/uberblock_impl.h>
#include <sys/vdev_impl.h>
@@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
ub->ub_txg = txg;
ub->ub_guid_sum = rvd->vdev_guid_sum;
ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
return (ub->ub_rootbp.blk_birth == txg);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index cb43af37f010..51a3c792deda 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -40,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
+#include <sys/dsl_scan.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
@@ -61,6 +61,7 @@ static vdev_ops_t *vdev_ops_table[] = {
#endif
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -95,9 +96,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
- uint64_t c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
@@ -106,40 +106,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
}
/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
*/
uint64_t
-vdev_get_rsize(vdev_t *vd)
+vdev_get_min_asize(vdev_t *vd)
{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
+ vdev_t *pvd = vd->vdev_parent;
+
+ /*
+ * The our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
- pvd = vd->vdev_parent;
+ /*
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
+ */
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
/*
- * If our parent is NULL or the root, just return our own psize.
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of its asize.
*/
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return (pvd->vdev_min_asize / pvd->vdev_children);
- rsize = 0;
+ return (pvd->vdev_min_asize);
+}
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
- return (rsize);
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
}
vdev_t *
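
For illustration (hypothetical numbers, not taken from any real pool): under the rules in vdev_get_min_asize() above, a raidz top-level vdev's min_asize is split evenly among its children, so a replacement leaf only has to cover its 1/Nth share rather than match the original disk's full physical size.

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: mirrors the raidz branch of vdev_get_min_asize(). */
int
main(void)
{
	uint64_t top_min_asize = 4ULL << 40;	/* hypothetical 4 TiB top-level */
	uint64_t children = 4;			/* 4-way raidz */

	/* Each raidz child must supply at least 1/Nth of the parent's asize. */
	printf("per-child minimum: %llu bytes\n",
	    (unsigned long long)(top_min_asize / children));
	return (0);
}
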
@@ -160,13 +167,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
- int c;
vdev_t *mvd;
if (vd->vdev_guid == guid)
return (vd);
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL)
return (mvd);
@@ -212,9 +218,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}
void
@@ -249,9 +252,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}
/*
@@ -262,17 +262,17 @@ vdev_compact_children(vdev_t *pvd)
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
- int newc, c;
+ int newc;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- for (c = newc = 0; c < oldc; c++)
+ for (int c = newc = 0; c < oldc; c++)
if (pvd->vdev_child[c])
newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
- for (c = newc = 0; c < oldc; c++) {
+ for (int c = newc = 0; c < oldc; c++) {
if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd;
cvd->vdev_id = newc++;
@@ -287,7 +287,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -299,21 +299,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(NULL);
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(spa);
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
@@ -324,6 +321,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -384,6 +382,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
}
/*
@@ -400,6 +401,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -407,23 +411,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
- /*
- * Currently, we can only support 2 parity devices.
- */
- if (nparity == 0 || nparity > 2)
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (EINVAL);
/*
- * Older versions can only support 1 parity device.
+ * Previous versions could only support 1 or 2 parity
+ * devices.
*/
- if (nparity == 2 &&
- spa_version(spa) < SPA_VERSION_RAID6)
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (ENOTSUP);
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
return (ENOTSUP);
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
- if (spa_version(spa) >= SPA_VERSION_RAID6)
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (EINVAL);
/*
* Otherwise, we default to 1 parity device for RAID-Z.
@@ -471,43 +476,86 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ if (parent && !parent->vdev_parent &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ }
+
+ if (parent && !parent->vdev_parent) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ vd->vdev_mg = metaslab_group_create(islog ?
+ spa_log_class(spa) : spa_normal_class(spa), vd);
}
/*
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ &vd->vdev_resilvering);
+
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
- * valid in the current context.
+ * valid in the current context. Local vdevs will
+ * remain in the faulted state.
*/
- if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ if (spa_load_state(spa) == SPA_LOAD_OPEN) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
}
}
@@ -524,7 +572,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
void
vdev_free(vdev_t *vd)
{
- int c;
spa_t *spa = vd->vdev_spa;
/*
@@ -534,11 +581,12 @@ vdev_free(vdev_t *vd)
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
*/
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL);
@@ -547,8 +595,10 @@ vdev_free(vdev_t *vd)
/*
* Discard allocation state.
*/
- if (vd == vd->vdev_top)
+ if (vd->vdev_mg != NULL) {
vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ }
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -668,14 +718,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
- int c;
-
if (vd == NULL)
return;
vd->vdev_top = tvd;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_top_update(tvd, vd->vdev_child[c]);
}
@@ -694,8 +742,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -737,6 +787,7 @@ vdev_remove_parent(vdev_t *cvd)
*/
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
@@ -756,16 +807,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -777,14 +834,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
ASSERT(oldc <= newc);
- if (vd->vdev_islog)
- mc = spa->spa_log_class;
- else
- mc = spa->spa_normal_class;
-
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
-
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (oldc != 0) {
@@ -819,6 +868,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
}
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (oldc == 0 && !vd->vdev_removing)
+ metaslab_group_activate(vd->vdev_mg);
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
return (0);
}
@@ -829,6 +892,7 @@ vdev_metaslab_fini(vdev_t *vd)
uint64_t count = vd->vdev_ms_count;
if (vd->vdev_ms != NULL) {
+ metaslab_group_passivate(vd->vdev_mg);
for (m = 0; m < count; m++)
if (vd->vdev_ms[m] != NULL)
metaslab_fini(vd->vdev_ms[m]);
@@ -956,6 +1020,10 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
@@ -988,6 +1056,55 @@ vdev_probe(vdev_t *vd, zio_t *zio)
return (NULL);
}
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+ taskq_t *tq;
+ int children = vd->vdev_children;
+
+ /*
+ * in order to handle pools on top of zvols, do the opens
+ * in a single thread so that the same thread holds the
+ * spa_namespace_lock
+ */
+ if (B_TRUE || vdev_uses_zvols(vd)) {
+ for (int c = 0; c < children; c++)
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ return;
+ }
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+ TQ_SLEEP) != 0);
+
+ taskq_destroy(tq);
+}
+
/*
* Prepare a virtual device for access.
*/
@@ -996,13 +1113,12 @@ vdev_open(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int error;
- int c;
uint64_t osize = 0;
uint64_t asize, psize;
uint64_t ashift = 0;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1010,11 +1126,18 @@ vdev_open(vdev_t *vd)
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- VDEV_AUX_ERR_EXCEEDED);
+ vd->vdev_label_aux);
return (ENXIO);
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
@@ -1024,6 +1147,11 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, NULL, ENXIO);
@@ -1039,20 +1167,40 @@ vdev_open(vdev_t *vd)
vd->vdev_removed = B_FALSE;
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (ENXIO);
+ }
+
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
- for (c = 0; c < vd->vdev_children; c++)
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
@@ -1077,6 +1225,15 @@ vdev_open(vdev_t *vd)
vd->vdev_psize = psize;
+ /*
+ * Make sure the allocatable size hasn't shrunk.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
@@ -1093,25 +1250,18 @@ vdev_open(vdev_t *vd)
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
+ }
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth. If automatic
+ * expansion is enabled then use the additional space.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand))
+ vd->vdev_asize = asize;
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
+ vdev_set_min_asize(vd);
/*
* Ensure we can issue some IO before declaring the
@@ -1119,8 +1269,8 @@ vdev_open(vdev_t *vd)
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_IO_FAILURE);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
return (error);
}
@@ -1150,12 +1300,11 @@ int
vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- int c;
nvlist_t *label;
- uint64_t guid, top_guid;
+ uint64_t guid = 0, top_guid;
uint64_t state;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF);
@@ -1165,6 +1314,8 @@ vdev_validate(vdev_t *vd)
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ uint64_t aux_guid = 0;
+ nvlist_t *nvl;
if ((label = vdev_label_read_config(vd)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1172,6 +1323,18 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ return (0);
+ }
+
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1180,6 +1343,11 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
@@ -1187,12 +1355,16 @@ vdev_validate(vdev_t *vd)
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
- (vd->vdev_guid != guid &&
+ ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -1211,11 +1383,11 @@ vdev_validate(vdev_t *vd)
nvlist_free(label);
/*
- * If spa->spa_load_verbatim is true, no need to check the
+ * If this is a verbatim import, no need to check the
* state of the pool.
*/
- if (!spa->spa_load_verbatim &&
- spa->spa_load_state == SPA_LOAD_OPEN &&
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
@@ -1238,15 +1410,23 @@ void
vdev_close(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ vdev_t *pvd = vd->vdev_parent;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
/*
- * We record the previous state before we close it, so that if we are
+ * We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
@@ -1260,12 +1440,49 @@ vdev_close(vdev_t *vd)
}
void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
vdev_close(vd);
(void) vdev_open(vd);
@@ -1278,12 +1495,8 @@ vdev_reopen(vdev_t *vd)
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd)) {
- uint64_t size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ !l2arc_vdev_present(vd))
+ l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd);
}
@@ -1323,33 +1536,23 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
return (0);
}
-/*
- * The is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
void
-vdev_init(vdev_t *vd, uint64_t txg)
+vdev_metaslab_set_size(vdev_t *vd)
{
/*
* Aim for roughly 200 metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
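
vdev_metaslab_set_size() above targets roughly 200 metaslabs per vdev and then rounds to a power of two via highbit(). A worked standalone example (the local highbit64() assumes the 1-based highbit() semantics, the 128 KiB floor stands in for SPA_MAXBLOCKSHIFT, and the 1 TiB size is hypothetical): a 1 TiB vdev ends up with ms_shift = 33, i.e. 128 metaslabs of 8 GiB each.

#include <stdio.h>
#include <stdint.h>

/* Local highbit(): number of the highest set bit, 1-based (0 for 0). */
static int
highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t asize = 1ULL << 40;		/* hypothetical 1 TiB vdev */
	int maxblockshift = 17;			/* assumed 128 KiB floor */
	int ms_shift = highbit64(asize / 200);	/* aim for ~200 metaslabs */

	if (ms_shift < maxblockshift)
		ms_shift = maxblockshift;

	printf("ms_shift=%d -> %llu metaslabs of %llu bytes\n", ms_shift,
	    (unsigned long long)(asize >> ms_shift),
	    (unsigned long long)(1ULL << ms_shift));
	return (0);
}
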
@@ -1364,7 +1567,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication. There are three kinds of DTL:
+ * the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
@@ -1405,6 +1608,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size))
@@ -1458,14 +1662,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
- (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
- /* XXX should check scrub_done? */
+ (spa->spa_scrub_started ||
+ (scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -1550,6 +1756,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1577,6 +1785,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1655,6 +1865,9 @@ vdev_dtl_required(vdev_t *vd)
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ if (!required && zio_injection_enabled)
+ required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
return (required);
}
@@ -1713,7 +1926,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1770,11 +1983,49 @@ vdev_validate_aux(vdev_t *vd)
}
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+ ASSERT(!vd->vdev_ishole);
+
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
@@ -1790,6 +2041,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1800,6 +2053,12 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -1822,11 +2081,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
* not be opened, and no I/O is attempted.
*/
int
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1834,19 +2093,28 @@ vdev_fault(spa_t *spa, uint64_t guid)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
/*
* Faulted state takes precedence over degraded.
*/
+ vd->vdev_delayed_close = B_FALSE;
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
/*
- * If marking the vdev as faulted cause the top-level vdev to become
- * unavailable, then back off and simply mark the vdev as degraded
- * instead.
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
*/
- if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -1854,12 +2122,10 @@ vdev_fault(spa_t *spa, uint64_t guid)
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
- vdev_reopen(vd);
+ vdev_reopen(tvd);
- if (vdev_readable(vd)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
- }
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
}
return (spa_vdev_state_exit(spa, vd, 0));
@@ -1871,11 +2137,11 @@ vdev_fault(spa_t *spa, uint64_t guid)
* as I/O is concerned.
*/
int
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1892,7 +2158,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
+ aux);
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -1906,9 +2172,9 @@ vdev_degrade(spa_t *spa, uint64_t guid)
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1916,13 +2182,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
- vdev_reopen(vd->vdev_top);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+ }
+
+ vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
@@ -1931,16 +2210,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *vd, *tvd;
- int error;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
- spa_vdev_state_enter(spa);
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1949,6 +2238,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
/*
* If the device isn't already offline, try to offline it.
@@ -1964,6 +2255,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_offline_log(spa);
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+ }
+
+ /*
* Offline this device and reopen its top-level vdev.
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
@@ -1978,28 +2300,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
- if (!tvd->vdev_islog || !vdev_is_dead(tvd))
- return (spa_vdev_state_exit(spa, vd, 0));
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
- (void) spa_vdev_state_exit(spa, vd, 0);
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
- NULL, DS_FIND_CHILDREN);
- if (error) {
- (void) vdev_online(spa, guid, 0, NULL);
- return (error);
- }
- /*
- * If we successfully offlined the log device then we need to
- * sync out the current txg so that the "stubby" block can be
- * removed by zil_sync().
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- return (0);
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
}
/*
@@ -2033,13 +2357,22 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
- vd->vdev_faulted = vd->vdev_degraded = 0;
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+ * still broken, we want to still post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
- vdev_reopen(vd);
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
- if (vd != rvd)
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
@@ -2047,12 +2380,30 @@ vdev_clear(spa_t *spa, vdev_t *vd)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
}
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2081,7 +2432,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole);
}
boolean_t
@@ -2111,10 +2462,11 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
mutex_exit(&vd->vdev_stat_lock);
/*
@@ -2131,7 +2483,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
@@ -2148,6 +2500,19 @@ vdev_clear_stats(vdev_t *vd)
}
void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
spa_t *spa = zio->io_spa;
@@ -2191,8 +2556,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += psize;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -2217,6 +2591,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
+ /*
+	 * Intent log writes won't propagate their error to the root
+	 * I/O, so don't mark these types of failures as pool-level
+	 * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
@@ -2230,14 +2612,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD))) {
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
/*
- * This is either a normal write (not a repair), or it's a
- * repair induced by the scrub thread. In the normal case,
- * we commit the DTL change in the same txg as the block
- * was born. In the scrub-induced repair case, we know that
- * scrubs run in first-pass syncing context, so we commit
- * the DTL change in spa->spa_syncing_txg.
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
@@ -2246,13 +2631,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa->spa_syncing_txg;
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
}
- ASSERT(commit_txg >= spa->spa_syncing_txg);
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2264,46 +2652,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- int c;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
- boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -2319,28 +2680,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (update_root) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
- return;
-
+ if (mc == spa_normal_class(spa)) {
mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
+
+ if (mc != NULL) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
+ }
}
/*
@@ -2355,6 +2714,8 @@ vdev_config_dirty(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_writeable(spa));
+
/*
* If this is an aux vdev (as with l2cache and spare devices), then we
* update the vdev config manually and set the sync flag.
@@ -2392,7 +2753,7 @@ vdev_config_dirty(vdev_t *vd)
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
@@ -2413,7 +2774,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2442,6 +2804,7 @@ vdev_state_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
@@ -2454,7 +2817,7 @@ vdev_state_dirty(vdev_t *vd)
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node))
+ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
@@ -2481,13 +2844,18 @@ vdev_propagate_state(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
- int c;
vdev_t *child;
if (vd->vdev_children > 0) {
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2551,15 +2919,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
/*
* If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
+ /*
+ * If we have brought this vdev back into service, we need
+ * to notify fmd so that it can gracefully repair any outstanding
+ * cases due to a missing device. We do this in all cases, even those
+ * that probably don't correlate to a repaired fault. This is sure to
+ * catch all cases, and we let the zfs-retire agent sort it out. If
+ * this is a transient state it's OK, as the retire agent will
+ * double-check the state of the vdev before repairing it.
+ */
+ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_prevstate != state)
+ zfs_post_state_change(spa, vd);
+
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
@@ -2575,20 +2959,16 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
- /*
- * Indicate to the ZFS DE that this device has been removed, and
- * any recent errors should be ignored.
- */
- zfs_post_remove(spa, vd);
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
*/
- if (spa->spa_load_state == SPA_LOAD_IMPORT &&
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
@@ -2631,9 +3011,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
- case VDEV_AUX_IO_FAILURE:
- class = FM_EREPORT_ZFS_IO_FAILURE;
- break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
@@ -2682,7 +3059,7 @@ vdev_is_bootable(vdev_t *vd)
return (B_FALSE);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
@@ -2690,31 +3067,84 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t c, children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(nvd->vdev_top->vdev_islog);
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf) {
/*
- * It would be nice to call vdev_offline()
- * directly but the pool isn't fully loaded and
- * the txg threads have not been started yet.
+ * Restore the persistent vdev state
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ nvd->vdev_faulted = ovd->vdev_faulted;
+ nvd->vdev_degraded = ovd->vdev_degraded;
+ nvd->vdev_removed = ovd->vdev_removed;
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
}
+ vdev_propagate_state(cvd);
}
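The vdev_expand() change above keys new metaslab creation off a simple shift test: a top-level vdev only grows when its updated asize now covers more metaslabs than it already has, at which point vdev_metaslab_init() is asked to add the new ones. A minimal sketch of that check, using hypothetical sizes rather than anything taken from the diff:

    #include <stdint.h>
    #include <stdbool.h>

    /* Illustrative only: mirrors the (asize >> ms_shift) > ms_count test. */
    static bool
    example_needs_expansion(uint64_t asize, uint64_t ms_shift, uint64_t ms_count)
    {
            return ((asize >> ms_shift) > ms_count);
    }

    /*
     * Example: a vdev grown from 100 GiB to 200 GiB with 1 GiB metaslabs
     * (ms_shift = 30) previously held 100 metaslabs; 200 GiB >> 30 == 200,
     * so the expansion path above would fire.
     */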
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index 8fc3738cab37..7978d612501a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -184,7 +184,7 @@ vdev_cache_allocate(zio_t *zio)
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
- ve->ve_lastused = LBOLT;
+ ve->ve_lastused = ddi_get_lbolt();
ve->ve_data = zio_buf_alloc(VCBS);
avl_add(&vc->vc_offset_tree, ve);
@@ -201,9 +201,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL);
- if (ve->ve_lastused != LBOLT) {
+ if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = LBOLT;
+ ve->ve_lastused = ddi_get_lbolt();
avl_add(&vc->vc_lastused_tree, ve);
}
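The remove/re-add sequence around updating ve_lastused in vdev_cache_hit() above is required because vc_lastused_tree orders its entries by that very field; changing the key of a node while it sits in the tree would silently break the AVL ordering invariant. A hypothetical comparator of the kind such a tree needs (names are stand-ins, not taken from the sources):

    #include <stdint.h>

    /* Illustrative only: an AVL keyed on a lastused timestamp. */
    static int
    example_lastused_compare(uint64_t a_lastused, uint64_t b_lastused)
    {
            if (a_lastused < b_lastused)
                    return (-1);
            if (a_lastused > b_lastused)
                    return (1);
            return (0);     /* a real tree also needs a unique tie-breaker */
    }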
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
index 5db7a6aec2f6..d7417736b4ee 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -19,12 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
@@ -44,12 +43,71 @@ typedef struct vdev_disk_buf {
zio_t *vdb_io;
} vdev_disk_buf_t;
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ddi_devid_t devid;
+ char *minor;
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ (void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
+ kmem_free(buf, len);
+ }
+
+ if (vd->vdev_name_vp == NULL)
+ (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
+
+ if (vd->vdev_devid != NULL &&
+ ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
+ (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
+ ddi_devid_str_free(minor);
+ ddi_devid_free(devid);
+ }
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ if (vd->vdev_name_vp) {
+ VN_RELE_ASYNC(vd->vdev_name_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_name_vp = NULL;
+ }
+ if (vd->vdev_devid_vp) {
+ VN_RELE_ASYNC(vd->vdev_devid_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_devid_vp = NULL;
+ }
+}
+
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
spa_t *spa = vd->vdev_spa;
vdev_disk_t *dvd;
- struct dk_minfo dkm;
+ struct dk_minfo_ext dkmext;
int error;
dev_t dev;
int otyp;
@@ -62,6 +120,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
return (EINVAL);
}
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ dvd = vd->vdev_tsd;
+ goto skip_open;
+ }
+
dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
/*
@@ -79,12 +147,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
*
* 3. Otherwise, the device may have moved. Try opening the device
* by the devid instead.
- *
- * If the vdev is part of the root pool, we avoid opening it by path.
- * We do this because there is no /dev path available early in boot,
- * and if we try to open the device by path at a later point, we can
- * deadlock when devfsadm attempts to open the underlying backing store
- * file.
*/
if (vd->vdev_devid != NULL) {
if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
@@ -96,7 +158,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
error = EINVAL; /* presume failure */
- if (vd->vdev_path != NULL && !spa_is_root(spa)) {
+ if (vd->vdev_path != NULL) {
ddi_devid_t devid;
if (vd->vdev_wholedisk == -1ULL) {
@@ -167,7 +229,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* as above. This hasn't been used in a very long time and we
* don't need to propagate its oddities to this edge condition.
*/
- if (error && vd->vdev_path != NULL && !spa_is_root(spa))
+ if (error && vd->vdev_path != NULL)
error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
kcred, &dvd->vd_lh, zfs_li);
}
@@ -202,6 +264,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
kmem_free(physpath, MAXPATHLEN);
}
+skip_open:
/*
* Determine the actual size of the device.
*/
@@ -224,11 +287,11 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* Determine the device's minimum transfer size.
* If the ioctl isn't supported, assume DEV_BSIZE.
*/
- if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
FKIOCTL, kcred, NULL) != 0)
- dkm.dki_lbsize = DEV_BSIZE;
+ dkmext.dki_pbsize = DEV_BSIZE;
- *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
+ *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
/*
* Clear the nowritecache bit, so that on a vdev_reopen() we will
@@ -244,7 +307,7 @@ vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
- if (dvd == NULL)
+ if (vd->vdev_reopening || dvd == NULL)
return;
if (dvd->vd_minor != NULL)
@@ -256,6 +319,7 @@ vdev_disk_close(vdev_t *vd)
if (dvd->vd_lh != NULL)
(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+ vd->vdev_delayed_close = B_FALSE;
kmem_free(dvd, sizeof (vdev_disk_t));
vd->vdev_tsd = NULL;
}
@@ -315,6 +379,11 @@ vdev_disk_ioctl_free(zio_t *zio)
kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}
+static const zio_vsd_ops_t vdev_disk_vsd_ops = {
+ vdev_disk_ioctl_free,
+ zio_vsd_default_cksum_report
+};
+
static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
@@ -355,7 +424,7 @@ vdev_disk_io_start(zio_t *zio)
}
zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
- zio->io_vsd_free = vdev_disk_ioctl_free;
+ zio->io_vsd_ops = &vdev_disk_vsd_ops;
dkc->dkc_callback = vdev_disk_ioctl_done;
dkc->dkc_flag = FLUSH_VOLATILE;
@@ -427,14 +496,23 @@ vdev_disk_io_done(zio_t *zio)
* asynchronous removal of the device. Otherwise, probe the device and
* make sure it's still accessible.
*/
- if (zio->io_error == EIO) {
+ if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
vdev_disk_t *dvd = vd->vdev_tsd;
int state = DKIO_NONE;
if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+ /*
+ * We post the resource as soon as possible, instead of
+ * when the async removal actually happens, because the
+ * DE is using this information to discard previous I/O
+ * errors.
+ */
+ zfs_post_remove(zio->io_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
}
}
}
@@ -446,6 +524,8 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
+ vdev_disk_hold,
+ vdev_disk_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -488,6 +568,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
+ *config = NULL;
for (l = 0; l < VDEV_LABELS; l++) {
uint64_t offset, state, txg = 0;
@@ -522,6 +603,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
kmem_free(label, sizeof (vdev_label_t));
(void) ldi_close(vd_lh, FREAD, kcred);
+ if (*config == NULL)
+ error = EIDRM;
return (error);
}
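The switch to DKIOCGMEDIAINFOEXT above bases ashift on the device's physical sector size (dki_pbsize) rather than its logical one, so disks that report 4096-byte physical sectors get allocations aligned to that size. A rough sketch of the highbit()-based computation, assuming the sector size is a power of two (the helper name is illustrative):

    #include <stdint.h>

    /*
     * Illustrative only: ashift = highbit(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1,
     * i.e. log2 of the larger of the physical sector size and 512.
     * A 512-byte sector yields ashift 9; a 4096-byte sector yields ashift 12.
     */
    static int
    example_ashift(uint64_t pbsize)
    {
            uint64_t v = pbsize > 512 ? pbsize : 512;
            int bit = -1;

            while (v != 0) {        /* position of the highest set bit */
                    v >>= 1;
                    bit++;
            }
            return (bit);
    }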
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
index 67bd110cd884..be3cefced741 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,6 +34,18 @@
* Virtual device vector for files.
*/
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
@@ -51,6 +62,17 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
return (EINVAL);
}
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ vp = vf->vf_vnode;
+ goto skip_open;
+ }
+
vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
/*
@@ -65,6 +87,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (error);
}
@@ -77,9 +101,13 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (vp->v_type != VREG) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (ENODEV);
}
#endif
+
+skip_open:
/*
* Determine the physical size of the file.
*/
@@ -92,6 +120,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if (error) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
return (error);
}
@@ -106,12 +136,15 @@ vdev_file_close(vdev_t *vd)
{
vdev_file_t *vf = vd->vdev_tsd;
- if (vf == NULL)
+ if (vd->vdev_reopening || vf == NULL)
return;
- if (vf->vf_vnode != NULL)
+ if (vf->vf_vnode != NULL) {
(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
kcred, NULL);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
kmem_free(vf, sizeof (vdev_file_t));
vd->vdev_tsd = NULL;
}
@@ -168,6 +201,8 @@ vdev_ops_t vdev_file_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
+ vdev_file_hold,
+ vdev_file_rele,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -184,6 +219,8 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
+ vdev_file_hold,
+ vdev_file_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
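The vdev_file_open() changes above follow a common error-path discipline: any failure after vd->vdev_tsd has been allocated frees it and clears the pointer, so a later vdev_file_close() or reopen never sees a half-initialized handle; conversely, when vdev_reopening is set the existing handle is reused via skip_open. A generic sketch of that pattern, with stand-in types and names rather than the ZFS ones:

    #include <stdlib.h>

    struct example_handle { int fd; };

    /* Illustrative only: free and clear per-device state on any open failure. */
    static int
    example_open(struct example_handle **tsdp, int open_result)
    {
            if (*tsdp != NULL)              /* reopen: reuse the existing handle */
                    return (0);

            *tsdp = calloc(1, sizeof (**tsdp));
            if (*tsdp == NULL)
                    return (-1);

            if (open_result != 0) {         /* open failed: undo the allocation */
                    free(*tsdp);
                    *tsdp = NULL;
                    return (open_result);
            }
            return (0);
    }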
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index fa42871ebd5d..4d4b63cb2a07 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -47,31 +47,39 @@ struct g_class zfs_vdev_class = {
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+/*
+ * Don't send BIO_FLUSH.
+ */
+static int vdev_geom_bio_flush_disable = 0;
+TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+
static void
vdev_geom_orphan(struct g_consumer *cp)
{
- struct g_geom *gp;
vdev_t *vd;
- int error;
g_topology_assert();
vd = cp->private;
- gp = cp->geom;
- error = cp->provider->error;
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- if (cp->acr + cp->acw + cp->ace > 0)
- g_access(cp, -cp->acr, -cp->acw, -cp->ace);
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- g_detach(cp);
- g_destroy_consumer(cp);
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, error);
- }
- vd->vdev_tsd = NULL;
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}
@@ -223,16 +231,12 @@ vdev_geom_read_guid(struct g_consumer *cp)
uint64_t psize;
off_t offset, size;
uint64_t guid;
- int error, l, len, iszvol;
+ int error, l, len;
g_topology_assert_not();
pp = cp->provider;
ZFS_LOG(1, "Reading guid from %s...", pp->name);
- if (g_getattr("ZFS::iszvol", cp, &iszvol) == 0 && iszvol) {
- ZFS_LOG(1, "Skipping ZVOL-based provider %s.", pp->name);
- return (0);
- }
psize = pp->mediasize;
psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
@@ -270,11 +274,6 @@ vdev_geom_read_guid(struct g_consumer *cp)
return (guid);
}
-struct vdev_geom_find {
- uint64_t guid;
- struct g_consumer *cp;
-};
-
static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{
@@ -283,25 +282,23 @@ vdev_geom_taste_orphan(struct g_consumer *cp)
cp->provider->name));
}
-static void
-vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
+static struct g_consumer *
+vdev_geom_attach_by_guid(uint64_t guid)
{
- struct vdev_geom_find *ap;
struct g_class *mp;
struct g_geom *gp, *zgp;
struct g_provider *pp;
- struct g_consumer *zcp;
- uint64_t guid;
+ struct g_consumer *cp, *zcp;
+ uint64_t pguid;
g_topology_assert();
- ap = arg;
-
zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
/* This orphan function should be never called. */
zgp->orphan = vdev_geom_taste_orphan;
zcp = g_new_consumer(zgp);
+ cp = NULL;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
@@ -317,39 +314,29 @@ vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
continue;
}
g_topology_unlock();
- guid = vdev_geom_read_guid(zcp);
+ pguid = vdev_geom_read_guid(zcp);
g_topology_lock();
g_access(zcp, -1, 0, 0);
g_detach(zcp);
- if (guid != ap->guid)
+ if (pguid != guid)
continue;
- ap->cp = vdev_geom_attach(pp);
- if (ap->cp == NULL) {
+ cp = vdev_geom_attach(pp);
+ if (cp == NULL) {
printf("ZFS WARNING: Unable to attach to %s.\n",
pp->name);
continue;
}
- goto end;
+ break;
}
+ if (cp != NULL)
+ break;
}
+ if (cp != NULL)
+ break;
}
- ap->cp = NULL;
end:
g_destroy_consumer(zcp);
g_destroy_geom(zgp);
-}
-
-static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid)
-{
- struct vdev_geom_find *ap;
- struct g_consumer *cp;
-
- ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
- ap->guid = guid;
- g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL);
- cp = ap->cp;
- kmem_free(ap, sizeof(*ap));
return (cp);
}
@@ -360,6 +347,8 @@ vdev_geom_open_by_guid(vdev_t *vd)
char *buf;
size_t len;
+ g_topology_assert();
+
ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
cp = vdev_geom_attach_by_guid(vd->vdev_guid);
if (cp != NULL) {
@@ -387,8 +376,9 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
struct g_consumer *cp;
uint64_t guid;
+ g_topology_assert();
+
cp = NULL;
- g_topology_lock();
pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
@@ -410,7 +400,6 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
}
}
}
- g_topology_unlock();
return (cp);
}
@@ -420,7 +409,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
- int error, owned;
+ size_t bufsize;
+ int error, lock;
/*
* We must have a pathname, and it must be absolute.
@@ -432,15 +422,22 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
vd->vdev_tsd = NULL;
- if ((owned = mtx_owned(&Giant)))
- mtx_unlock(&Giant);
+ if (mutex_owned(&spa_namespace_lock)) {
+ mutex_exit(&spa_namespace_lock);
+ lock = 1;
+ } else {
+ lock = 0;
+ }
+ DROP_GIANT();
+ g_topology_lock();
error = 0;
/*
- * If we're creating pool, just find GEOM provider by its name
- * and ignore GUID mismatches.
+ * If we're creating or splitting a pool, just find the GEOM provider
+ * by its name and ignore GUID mismatches.
*/
- if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_splitting_newspa == B_TRUE)
cp = vdev_geom_open_by_path(vd, 0);
else {
cp = vdev_geom_open_by_path(vd, 1);
@@ -472,7 +469,6 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
int i;
- g_topology_lock();
for (i = 0; i < 5; i++) {
error = g_access(cp, 0, 1, 0);
if (error == 0)
@@ -487,10 +483,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
vdev_geom_detach(cp, 0);
cp = NULL;
}
- g_topology_unlock();
}
- if (owned)
- mtx_lock(&Giant);
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (lock)
+ mutex_enter(&spa_namespace_lock);
if (cp == NULL) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
@@ -516,6 +513,12 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
*/
vd->vdev_nowritecache = B_FALSE;
+ if (vd->vdev_physpath != NULL)
+ spa_strfree(vd->vdev_physpath);
+ bufsize = sizeof("/dev/") + strlen(pp->name);
+ vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
+ snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
+
return (0);
}
@@ -528,30 +531,50 @@ vdev_geom_close(vdev_t *vd)
if (cp == NULL)
return;
vd->vdev_tsd = NULL;
+ vd->vdev_delayed_close = B_FALSE;
g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
}
static void
vdev_geom_io_intr(struct bio *bp)
{
+ vdev_t *vd;
zio_t *zio;
zio = bp->bio_caller1;
+ vd = zio->io_vd;
zio->io_error = bp->bio_error;
if (zio->io_error == 0 && bp->bio_resid != 0)
zio->io_error = EIO;
if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
- vdev_t *vd;
-
/*
* If we get ENOTSUP, we know that no future
* attempts will ever succeed. In this case we
* set a persistent bit so that we don't bother
* with the ioctl in the future.
*/
- vd = zio->io_vd;
vd->vdev_nowritecache = B_TRUE;
}
+ if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
+ /*
+			 * If the provider's error is set, we assume it is
+			 * being removed.
+ */
+ if (bp->bio_to->error != 0) {
+ /*
+ * We post the resource as soon as possible, instead of
+ * when the async removal actually happens, because the
+ * DE is using this information to discard previous I/O
+ * errors.
+ */
+ /* XXX: zfs_post_remove() can sleep. */
+ zfs_post_remove(zio->io_spa, vd);
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
g_destroy_bio(bp);
zio_interrupt(zio);
}
@@ -577,7 +600,7 @@ vdev_geom_io_start(zio_t *zio)
case DKIOCFLUSHWRITECACHE:
- if (zfs_nocacheflush)
+ if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
break;
if (vd->vdev_nowritecache) {
@@ -628,6 +651,16 @@ vdev_geom_io_done(zio_t *zio)
{
}
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
vdev_ops_t vdev_geom_ops = {
vdev_geom_open,
vdev_geom_close,
@@ -635,6 +668,8 @@ vdev_ops_t vdev_geom_ops = {
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
+ vdev_geom_hold,
+ vdev_geom_rele,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
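One easy-to-miss detail in the vdev_geom_open() changes above is the physpath buffer sizing: sizeof("/dev/") counts the terminating NUL, so the allocation already has room for the prefix, the provider name, and the NUL that snprintf() writes. A standalone illustration (pp_name is a stand-in for pp->name):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *
    example_physpath(const char *pp_name)
    {
            /* sizeof ("/dev/") == 6: five characters plus the NUL. */
            size_t bufsize = sizeof ("/dev/") + strlen(pp_name);
            char *physpath = malloc(bufsize);

            if (physpath != NULL)
                    (void) snprintf(physpath, bufsize, "/dev/%s", pp_name);
            return (physpath);      /* e.g. "/dev/ada0" for pp_name "ada0" */
    }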
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index 48d5fc232b34..c08ed8ba0467 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -141,6 +140,7 @@
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
/*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare, boolean_t isl2cache)
+ vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare && !isl2cache)
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
== 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= SPA_VERSION_RAID6));
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
@@ -268,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
- if (!isspare && !isl2cache && vd == vd->vdev_top) {
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -279,42 +282,80 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
vd->vdev_islog) == 0);
+ if (vd->vdev_removing)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing) == 0);
}
if (vd->vdev_dtl_smo.smo_object != 0)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
+ pool_scan_stat_t ps;
+
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+ /* provide either current or previous scan information */
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ VERIFY(nvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+ == 0);
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare, isl2cache);
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ }
- for (c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
} else {
+ const char *aux = NULL;
+
if (vd->vdev_offline && !vd->vdev_tmpoffline)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
B_TRUE) == 0);
+ if (vd->vdev_resilvering)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ B_TRUE) == 0);
if (vd->vdev_faulted)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
B_TRUE) == 0);
@@ -327,11 +368,66 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
+
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
+ aux) == 0);
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid) == 0);
+ }
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
@@ -478,6 +574,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
return (B_TRUE);
/*
+ * We can't rely on a pool's state if it's been imported
+	 * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
@@ -514,6 +619,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
crtxg, reason)) != 0)
return (error);
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
@@ -526,7 +634,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Determine if the vdev is in use.
*/
- if (reason != VDEV_LABEL_REMOVE &&
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (EBUSY);
@@ -552,7 +660,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
*/
if (reason == VDEV_LABEL_SPARE)
return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
}
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
@@ -617,7 +726,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
/*
* Add our creation time. This allows us to detect multiple
@@ -642,8 +755,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+ bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
@@ -672,11 +785,9 @@ retry:
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
- }
+ vdev_label_write(zio, vd, l, ub,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
@@ -688,7 +799,7 @@ retry:
nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
@@ -717,11 +828,6 @@ retry:
*/
/*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
-/*
* Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we
* come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -750,6 +856,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
static void
vdev_uberblock_load_done(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data;
uberblock_t *ubbest = rio->io_private;
@@ -758,7 +865,7 @@ vdev_uberblock_load_done(zio_t *zio)
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
- if (ub->ub_txg <= ub_max_txg &&
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub;
mutex_exit(&rio->io_lock);
@@ -976,6 +1083,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
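The vdev_uberblock_load_done() change above replaces the global ub_max_txg debugging knob with spa_load_max_txg, so pool recovery can cap which uberblocks are eligible during import. The selection implied by vdev_uberblock_compare() keeps the verified uberblock with the greatest txg not exceeding that cap, with ties typically broken by the uberblock timestamp. A minimal sketch of that selection, not the in-tree code (the struct and helper are illustrative, though the field names follow the on-disk uberblock):

    #include <stdint.h>

    struct example_ub {
            uint64_t ub_txg;
            uint64_t ub_timestamp;
    };

    /*
     * Illustrative only: nonzero if 'cand' should replace 'best' under the
     * "highest txg, then newest timestamp, capped by spa_load_max_txg" rule.
     */
    static int
    example_ub_better(const struct example_ub *cand,
        const struct example_ub *best, uint64_t load_max_txg)
    {
            if (cand->ub_txg > load_max_txg)
                    return (0);             /* beyond the rewind point */
            if (cand->ub_txg != best->ub_txg)
                    return (cand->ub_txg > best->ub_txg);
            return (cand->ub_timestamp > best->ub_timestamp);
    }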
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index fff7e0842256..698c0275d34e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio)
kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
}
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ vdev_mirror_map_free,
+ zio_vsd_default_cksum_report
+};
+
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
@@ -117,28 +122,28 @@ vdev_mirror_map_alloc(zio_t *zio)
}
zio->io_vsd = mm;
- zio->io_vsd_free = vdev_mirror_map_free;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
return (mm);
}
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- vdev_t *cvd;
- uint64_t c;
int numerrors = 0;
- int ret, lasterror = 0;
+ int lasterror = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_open_children(vd);
- if ((ret = vdev_open(cvd)) != 0) {
- lasterror = ret;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
@@ -158,9 +163,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
static void
vdev_mirror_close(vdev_t *vd)
{
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
@@ -211,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio)
uint64_t txg = zio->io_txg;
int i, c;
- ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
/*
* Try to find a child whose DTL doesn't contain the block to read.
@@ -449,6 +452,8 @@ vdev_ops_t vdev_mirror_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -460,6 +465,8 @@ vdev_ops_t vdev_replacing_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -471,6 +478,8 @@ vdev_ops_t vdev_spare_ops = {
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
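The changed assertion in vdev_mirror_child_select() (BP_PHYSICAL_BIRTH instead of blk_birth) reflects that, once deduplication is in play, a block's physical allocation txg can differ from its logical birth txg, and DTL lookups care about when the data was physically written. A hedged sketch of what such a macro resolves to (field names assumed for illustration, not quoted from spa.h):

    #include <stdint.h>

    struct example_bp {
            uint64_t blk_birth;        /* logical birth txg */
            uint64_t blk_phys_birth;   /* physical birth txg, 0 if identical */
    };

    /*
     * Illustrative only: fall back to the logical birth when no separate
     * physical birth was recorded for the block.
     */
    static uint64_t
    example_physical_birth(const struct example_bp *bp)
    {
            return (bp->blk_phys_birth != 0 ?
                bp->blk_phys_birth : bp->blk_birth);
    }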
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
index 731f7d3dcec9..6a5588d59213 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
+ *psize = 0;
+ *ashift = 0;
return (0);
}
@@ -80,6 +80,21 @@ vdev_ops_t vdev_missing_ops = {
vdev_missing_io_start,
vdev_missing_io_done,
NULL,
+ NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
+
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ NULL,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index de3f1db75961..b44f3b289d9f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -24,7 +24,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
@@ -41,37 +40,48 @@
int zfs_vdev_max_pending = 10;
int zfs_vdev_min_pending = 4;
-/* deadline = pri + (LBOLT >> time_shift) */
+/* deadline = pri + (ddi_get_lbolt64() >> time_shift) */
int zfs_vdev_time_shift = 6;
/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;
/*
- * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
- * For read i/os, we also aggregate across small adjacency gaps.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
&zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
&zfs_vdev_min_pending, 0,
"Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
&zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
&zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
&zfs_vdev_aggregation_limit, 0,
"I/O requests are aggregated up to this size");
+TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
+ &zfs_vdev_read_gap_limit, 0,
+ "Acceptable gap between two reads being aggregated");
+TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
+ &zfs_vdev_write_gap_limit, 0,
+ "Acceptable gap between two writes being aggregated");
/*
* Virtual device vector for disk I/O scheduling.
@@ -191,12 +201,14 @@ vdev_queue_agg_io_done(zio_t *aio)
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
- zio_t *fio, *lio, *aio, *dio, *nio;
+ zio_t *fio, *lio, *aio, *dio, *nio, *mio;
avl_tree_t *t;
int flags;
uint64_t maxspan = zfs_vdev_aggregation_limit;
uint64_t maxgap;
+ int stretch;
+again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
@@ -211,21 +223,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
- * We can aggregate I/Os that are adjacent and of the
- * same flavor, as expressed by the AGG_INHERIT flags.
- * The latter is necessary so that certain attributes
- * of the I/O, such as whether it's a normal I/O or a
- * scrub/resilver, can be preserved in the aggregate.
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+		 * recording the last non-optional I/O.
*/
while ((dio = AVL_PREV(t, fio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+ IO_SPAN(dio, lio) <= maxspan &&
+ IO_GAP(dio, fio) <= maxgap) {
fio = dio;
+ if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = fio;
+ }
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+ fio = AVL_NEXT(t, fio);
+ ASSERT(fio != NULL);
+ }
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ */
while ((dio = AVL_NEXT(t, lio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+ IO_SPAN(fio, dio) <= maxspan &&
+ IO_GAP(lio, dio) <= maxgap) {
lio = dio;
+ if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = lio;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+		 * For reads, there's nothing to do.  For writes, even though
+		 * we can't aggregate any further here, a trailing optional
+		 * I/O may allow the underlying device to aggregate with
+		 * subsequent I/Os.  We must therefore determine whether the
+		 * next non-optional I/O is close enough to make that
+		 * aggregation worthwhile.
+ * worthwhile.
+ */
+ stretch = B_FALSE;
+ if (t != &vq->vq_read_tree && mio != NULL) {
+ nio = lio;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /* This may be a no-op. */
+ VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ while (lio != mio && lio != fio) {
+ ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+ lio = AVL_PREV(t, lio);
+ ASSERT(lio != NULL);
+ }
+ }
}
if (fio != lio) {
@@ -244,10 +323,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == t);
- if (dio->io_type == ZIO_TYPE_WRITE)
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+ bzero((char *)aio->io_data + (dio->io_offset -
+ aio->io_offset), dio->io_size);
+ } else if (dio->io_type == ZIO_TYPE_WRITE) {
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
+ }
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
@@ -263,6 +347,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
ASSERT(fio->io_vdev_tree == t);
vdev_queue_io_remove(vq, fio);
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (fio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(fio);
+ zio_execute(fio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
avl_add(&vq->vq_pending_tree, fio);
return (fio);
@@ -288,7 +386,8 @@ vdev_queue_io(zio_t *zio)
mutex_enter(&vq->vq_lock);
- zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;
+ zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+ zio->io_priority;
vdev_queue_io_add(vq, zio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 92753d8714c0..4b0f5602c1d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,12 +34,27 @@
/*
* Virtual device vector for RAID-Z.
*
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
*
* o addition (+) is represented by a bitwise XOR
* o subtraction (-) is therefore identical to addition: A + B = A - B
@@ -55,22 +69,32 @@
* (A * 2)_0 = A_7
*
* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
*
* Observe that any number in the field (except for 0) can be expressed as a
* power of 2 -- a generator for the field. We store a table of the powers of
* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
*
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
*
* P = D_0 + D_1 + ... + D_n-2 + D_n-1
* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property, which is why the uncorrected Plank method breaks down.)
*
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * See the reconstruction code below for how P, Q and R can be used individually
+ * or in concert to recover missing data columns.
*/
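To make the field arithmetic above concrete, here is a small standalone sketch (not part of the patch; the gf_mul2/gf_mul4 helper names are hypothetical) that applies the multiply-by-2 rule and the P/Q/R recurrences to one byte from each of four data columns, then recovers one column from P alone:

#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
	/* (a << 1) reduced by the primitive polynomial x^8+x^4+x^3+x^2+1 */
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static uint8_t
gf_mul4(uint8_t a)
{
	return (gf_mul2(gf_mul2(a)));
}

int
main(void)
{
	/* one byte from each of four data columns D_0..D_3 */
	uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 };
	uint8_t p = 0, q = 0, r = 0, d2;
	int i;

	/*
	 * P = D_0 + D_1 + D_2 + D_3
	 * Q = ((D_0 * 2 + D_1) * 2 + D_2) * 2 + D_3
	 * R = ((D_0 * 4 + D_1) * 4 + D_2) * 4 + D_3
	 */
	for (i = 0; i < 4; i++) {
		p ^= d[i];
		q = gf_mul2(q) ^ d[i];
		r = gf_mul4(r) ^ d[i];
	}
	printf("P=%02x Q=%02x R=%02x\n", p, q, r);

	/* a single lost data column is just the XOR of P and the others */
	d2 = p ^ d[0] ^ d[1] ^ d[3];
	printf("recovered D_2=%02x (expected %02x)\n", d2, d[2]);
	return (0);
}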
typedef struct raidz_col {
@@ -78,27 +102,60 @@ typedef struct raidz_col {
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
void *rc_data; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;
typedef struct raidz_map {
- uint64_t rm_cols; /* Column count */
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ void *rm_datacopy; /* rm_asize-buffer of copied data */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
-#define VDEV_RAIDZ_MAXPARITY 2
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
-#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
+
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
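As a sanity check on the macros above, this standalone sketch (not part of the patch; the local function names are made up) compares the 64-bit SWAR multiply-by-2 against the byte-at-a-time definition for each of the eight packed bytes:

#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static uint64_t
gf_mul2_64(uint64_t x)
{
	/* mask ends up 0xff in each byte whose top bit was set, 0x00 otherwise */
	uint64_t mask = x & 0x8080808080808080ULL;

	mask = (mask << 1) - (mask >> 7);
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

int
main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	uint64_t swar = gf_mul2_64(x);
	int i;

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;
		uint8_t s = (swar >> (i * 8)) & 0xff;

		printf("byte %d: %02x -> %02x (bytewise %02x)\n",
		    i, b, s, gf_mul2(b));
	}
	return (0);
}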
+
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
/*
* These two tables represent powers and logs of 2 in the Galois field defined
@@ -173,6 +230,8 @@ static const uint8_t vdev_raidz_log2[256] = {
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
+static void vdev_raidz_generate_parity(raidz_map_t *rm);
+
/*
* Multiply a given number by 2 raised to the given power.
*/
@@ -193,17 +252,184 @@ vdev_raidz_exp2(uint_t a, int exp)
}
static void
-vdev_raidz_map_free(zio_t *zio)
+vdev_raidz_map_free(raidz_map_t *rm)
{
- raidz_map_t *rm = zio->io_vsd;
int c;
+ size_t size;
- for (c = 0; c < rm->rm_firstdatacol; c++)
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+ if (rm->rm_col[c].rc_gdata != NULL)
+ zio_buf_free(rm->rm_col[c].rc_gdata,
+ rm->rm_col[c].rc_size);
+ }
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ if (rm->rm_datacopy != NULL)
+ zio_buf_free(rm->rm_datacopy, size);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT3U(rm->rm_freed, ==, 0);
+ rm->rm_freed = 1;
+
+ if (rm->rm_reports == 0)
+ vdev_raidz_map_free(rm);
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ vdev_raidz_map_free(rm);
}
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ size_t c = zcr->zcr_cbinfo;
+ size_t x;
+
+ const char *good = NULL;
+ const char *bad = rm->rm_col[c].rc_data;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ if (c < rm->rm_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical ZIO)
+ */
+ if (rm->rm_col[0].rc_gdata == NULL) {
+ char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ char *buf;
+
+ /*
+ * Set up the rm_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ bad_parity[x] = rm->rm_col[x].rc_data;
+ rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+ zio_buf_alloc(rm->rm_col[x].rc_size);
+ }
+
+ /* fill in the data columns from good_data */
+ buf = (char *)good_data;
+ for (; x < rm->rm_cols; x++) {
+ rm->rm_col[x].rc_data = buf;
+ buf += rm->rm_col[x].rc_size;
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity(rm);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rm->rm_firstdatacol; x++)
+ rm->rm_col[x].rc_data = bad_parity[x];
+
+ buf = rm->rm_datacopy;
+ for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
+ rm->rm_col[x].rc_data = buf;
+ buf += rm->rm_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
+ good = rm->rm_col[c].rc_gdata;
+ } else {
+ /* adjust good_data to point at the start of our column */
+ good = good_data;
+
+ for (x = rm->rm_firstdatacol; x < c; x++)
+ good += rm->rm_col[x].rc_size;
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ caddr_t buf;
+
+ raidz_map_t *rm = zio->io_vsd;
+ size_t size;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_raidz_cksum_finish;
+ zcr->zcr_free = vdev_raidz_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_datacopy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them.
+ */
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ buf = rm->rm_datacopy = zio_buf_alloc(size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bcopy(col->rc_data, buf, col->rc_size);
+ col->rc_data = buf;
+
+ buf += col->rc_size;
+ }
+ ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+ vdev_raidz_map_free_vsd,
+ vdev_raidz_cksum_report
+};
+
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
@@ -213,24 +439,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t s = zio->io_size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, coff, devidx;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
q = s / (dcols - nparity);
r = s - q * (dcols - nparity);
bc = (r == 0 ? 0 : r + nparity);
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ if (q == 0) {
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
- acols = (q == 0 ? bc : dcols);
+ ASSERT3U(acols, <=, scols);
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
rm->rm_cols = acols;
+ rm->rm_scols = scols;
rm->rm_bigcols = bc;
- rm->rm_asize = 0;
+ rm->rm_skipstart = bc;
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
+ rm->rm_datacopy = NULL;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
+
+ asize = 0;
- for (c = 0; c < acols; c++) {
+ for (c = 0; c < scols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
@@ -239,15 +481,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
rm->rm_col[c].rc_skipped = 0;
- rm->rm_asize += rm->rm_col[c].rc_size;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
}
- rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+ ASSERT3U(asize, ==, tot << unit_shift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+ ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -272,6 +526,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
* Unfortunately, this decision created an implicit on-disk format
* requirement that we need to support for all eternity, but only
* for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
*/
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
@@ -283,10 +542,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
rm->rm_col[1].rc_devidx = devidx;
rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
}
zio->io_vsd = rm;
- zio->io_vsd_free = vdev_raidz_map_free;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
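The geometry computed in vdev_raidz_map_alloc() is easier to follow with a worked example. The sketch below is not part of the patch; the disk count, parity level, and I/O size are assumed values. It reproduces the q/r/bc/tot/acols/scols/nskip arithmetic for a 6-sector (3KB) write to a 5-wide raidz2 with 512-byte sectors:

#include <stdio.h>
#include <stdint.h>

#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t unit_shift = 9;		/* 512-byte sectors */
	uint64_t dcols = 5;			/* disks in the raidz group */
	uint64_t nparity = 2;			/* raidz2 */
	uint64_t s = 6;				/* data sectors in the I/O */

	uint64_t q = s / (dcols - nparity);	/* full rows */
	uint64_t r = s - q * (dcols - nparity);	/* leftover data sectors */
	uint64_t bc = (r == 0 ? 0 : r + nparity); /* oversized columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
	uint64_t acols = (q == 0 ? bc : dcols);
	uint64_t scols = (q == 0 ?
	    MIN(dcols, ROUNDUP(bc, nparity + 1)) : dcols);
	uint64_t asize = tot << unit_shift;
	uint64_t nskip = ROUNDUP(tot, nparity + 1) - tot;

	printf("q=%ju r=%ju bc=%ju tot=%ju acols=%ju scols=%ju\n",
	    (uintmax_t)q, (uintmax_t)r, (uintmax_t)bc, (uintmax_t)tot,
	    (uintmax_t)acols, (uintmax_t)scols);
	printf("rm_asize=%ju bytes, %ju skip sector(s) of padding\n",
	    (uintmax_t)ROUNDUP(asize, (nparity + 1) << unit_shift),
	    (uintmax_t)nskip);
	return (0);
}

For these inputs the sketch prints q=2, r=0, tot=10, and 2 skip sectors, so the 5120 bytes of data and parity are rounded up to 6144 bytes on the vdev, matching the rm_asize/rm_nskip assertions in the allocator.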
@@ -305,12 +567,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
if (c == rm->rm_firstdatacol) {
ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p = *src;
}
} else {
ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p ^= *src;
}
}
@@ -320,10 +582,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
int c;
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
@@ -331,55 +593,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
src = rm->rm_col[c].rc_data;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
*p = *src;
+ *q = *src;
}
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
+ for (; i < pcnt; i++, src++, p++, q++) {
*p = 0;
+ *q = 0;
}
} else {
- ASSERT(ccount <= pcount);
+ ASSERT(ccnt <= pcnt);
/*
- * Rather than multiplying each byte individually (as
- * described above), we are able to handle 8 at once
- * by generating a mask based on the high bit in each
- * byte and using that to conditionally XOR in 0x1d.
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
*/
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
+ *p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
*q ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (; i < pcnt; i++, q++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+ uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ int c;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+ *p = *src;
+ *q = *src;
+ *r = *src;
+ }
+ for (; i < pcnt; i++, src++, p++, q++, r++) {
+ *p = 0;
+ *q = 0;
+ *r = 0;
+ }
+ } else {
+ ASSERT(ccnt <= pcnt);
+
+ /*
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
+ */
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
*p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ *q ^= *src;
+
+ VDEV_RAIDZ_64MUL_4(*r, mask);
+ *r ^= *src;
}
/*
* Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
*/
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (; i < pcnt; i++, q++, r++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ VDEV_RAIDZ_64MUL_4(*r, mask);
}
}
}
}
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ switch (rm->rm_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rm);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rm);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rm);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, i;
+ int x = tgts[0];
int c;
+ ASSERT(ntgts == 1);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(x < rm->rm_cols);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
ASSERT(xcount > 0);
@@ -404,15 +749,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
*dst ^= *src;
}
}
+
+ return (1 << VDEV_RAIDZ_P);
}
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, mask, i;
uint8_t *b;
+ int x = tgts[0];
int c, j, exp;
+ ASSERT(ntgts == 1);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
@@ -436,23 +786,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
}
} else {
- /*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
- */
for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
*dst ^= *src;
}
for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
}
}
}
@@ -467,15 +807,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
*b = vdev_raidz_exp2(*b, exp);
}
}
+
+ return (1 << VDEV_RAIDZ_Q);
}
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
void *pdata, *qdata;
uint64_t xsize, ysize, i;
+ int x = tgts[0];
+ int y = tgts[1];
+ ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(y < rm->rm_cols);
@@ -553,15 +898,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
*/
rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, speedy
+ * computation, and linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
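The non-trivial rows of the dispersal matrix in the example above can be regenerated directly from powers of the generators. This standalone sketch (not part of the patch; helper names are hypothetical) prints the P, Q, and R rows for n = 8 data columns, which is the same computation vdev_raidz_matrix_init() performs with the pow2/log2 tables and should match the "1 1 ... 1", "128 64 ... 1", and "19 205 116 29 64 16 4 1" rows shown in the comment:

#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

/* raise the generator 2^c (c = 0, 1, 2 for P, Q, R) to the power e */
static uint8_t
gf_gen_pow(int c, int e)
{
	uint8_t v = 1;
	int i;

	for (i = 0; i < c * e; i++)
		v = gf_mul2(v);
	return (v);
}

int
main(void)
{
	int n = 8;			/* data columns */
	int c, j;

	for (c = 0; c < 3; c++) {	/* P, Q, R rows */
		printf("parity %d:", c);
		for (j = 0; j < n; j++)
			printf(" %3d", gf_gen_pow(c, n - 1 - j));
		printf("\n");
	}
	return (0);
}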
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+ jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+ * Verify that the data left in the rows is properly part of
+ * an identity matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ }
+ }
}
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t log, val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rm->rm_cols);
+
+ src = rm->rm_col[c].rc_data;
+ ccount = rm->rm_col[c].rc_size;
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rm->rm_firstdatacol;
+ ASSERT3U(cc, >=, rm->rm_firstdatacol);
+ ASSERT3U(cc, <, rm->rm_cols);
+ ASSERT3U(cc, !=, c);
+
+ dst[j] = rm->rm_col[cc].rc_data;
+ dcount[j] = rm->rm_col[cc].rc_size;
+ }
+
+ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+ uint8_t *p, *pp;
+ size_t psize;
+
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ int code = 0;
+
+
+ n = rm->rm_cols - rm->rm_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rm->rm_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rm->rm_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rm->rm_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rm->rm_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * The tgts list must already be sorted.
+ */
+ for (i = 1; i < nt; i++) {
+ ASSERT(t[i] > t[i - 1]);
+ }
+
+ nbadparity = rm->rm_firstdatacol;
+ nbaddata = rm->rm_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rm->rm_cols; c++) {
+ if (c < rm->rm_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rm->rm_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rm->rm_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ if (!vdev_raidz_default_to_general) {
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+
+ break;
+ }
+ }
+
+ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
- int c, error;
+ int c;
int lasterror = 0;
int numerrors = 0;
@@ -573,11 +1457,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
return (EINVAL);
}
+ vdev_open_children(vd);
+
for (c = 0; c < vd->vdev_children; c++) {
cvd = vd->vdev_child[c];
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
@@ -636,10 +1522,9 @@ vdev_raidz_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
- blkptr_t *bp = zio->io_bp;
raidz_map_t *rm;
raidz_col_t *rc;
- int c;
+ int c, i;
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
@@ -647,13 +1532,7 @@ vdev_raidz_io_start(zio_t *zio)
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * Generate RAID parity in the first virtual columns.
- */
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
@@ -664,6 +1543,23 @@ vdev_raidz_io_start(zio_t *zio)
vdev_raidz_child_done, rc));
}
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rm->rm_scols);
+ if (c == rm->rm_scols)
+ c = 0;
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
return (ZIO_PIPELINE_CONTINUE);
}
@@ -671,8 +1567,7 @@ vdev_raidz_io_start(zio_t *zio)
/*
* Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity
- * data.
+ * last -- any errors along the way will force us to read the parity.
*/
for (c = rm->rm_cols - 1; c >= 0; c--) {
rc = &rm->rm_col[c];
@@ -687,7 +1582,7 @@ vdev_raidz_io_start(zio_t *zio)
rc->rc_skipped = 1;
continue;
}
- if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
@@ -708,23 +1603,47 @@ vdev_raidz_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+
/*
* Report a checksum error for a child of a RAID-Z device.
*/
static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
+
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+ &zbc);
}
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret != 0 && zbc.zbc_injected != 0)
+ rm->rm_ecksuminjected = 1;
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+ return (ret);
}
/*
@@ -748,17 +1667,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
bcopy(rc->rc_data, orig[c], rc->rc_size);
}
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
- raidz_checksum_error(zio, rc);
+ raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = ECKSUM;
ret++;
}
@@ -768,9 +1684,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
return (ret);
}
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
static int
vdev_raidz_worst_error(raidz_map_t *rm)
@@ -783,19 +1700,177 @@ vdev_raidz_worst_error(raidz_map_t *rm)
return (error);
}
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
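The search space walked by vdev_raidz_combrec() is simply every combination of up to rm_firstdatacol - total_errors candidate columns. The recursive sketch below is not part of the patch; the column count is assumed, and it ignores the refinements in the real code (skipping already-failed columns, forcing at least one data column when there were no data errors, and the iterative advance of the target array). It only illustrates the enumeration itself:

#include <stdio.h>

#define	NCOLS	6	/* assumed: 2 parity + 4 data columns */

static void
try_combination(const int *tgts, int n)
{
	int i;

	printf("reconstruct columns {");
	for (i = 0; i < n; i++)
		printf(" %d", tgts[i]);
	printf(" }\n");
}

static void
combine(int *tgts, int depth, int n, int start)
{
	int c;

	if (depth == n) {
		try_combination(tgts, n);
		return;
	}
	for (c = start; c < NCOLS; c++) {
		tgts[depth] = c;
		combine(tgts, depth + 1, n, c + 1);
	}
}

int
main(void)
{
	int tgts[NCOLS];
	int n;

	for (n = 1; n <= 2; n++)	/* up to the available spare parity */
		combine(tgts, 0, n, 0);
	return (0);
}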
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *tgts = &tstore[1];
+ int current, next, i, c, n;
+ int code, ret = 0;
+
+ ASSERT(total_errors < rm->rm_firstdatacol);
+
+ /*
+ * This simplifies one edge condition.
+ */
+ tgts[-1] = -1;
+
+ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ /*
+ * Initialize the targets array by finding the first n columns
+ * that contain no error.
+ *
+ * If there were no data errors, we need to ensure that we're
+ * always explicitly attempting to reconstruct at least one
+ * data column. To do this, we simply push the highest target
+ * up into the data columns.
+ */
+ for (c = 0, i = 0; i < n; i++) {
+ if (i == n - 1 && data_errors == 0 &&
+ c < rm->rm_firstdatacol) {
+ c = rm->rm_firstdatacol;
+ }
+
+ while (rm->rm_col[c].rc_error != 0) {
+ c++;
+ ASSERT3S(c, <, rm->rm_cols);
+ }
+
+ tgts[i] = c++;
+ }
+
+ /*
+ * Setting tgts[n] simplifies the other edge condition.
+ */
+ tgts[n] = rm->rm_cols;
+
+ /*
+ * These buffers were allocated in previous iterations.
+ */
+ for (i = 0; i < n - 1; i++) {
+ ASSERT(orig[i] != NULL);
+ }
+
+ orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+ current = 0;
+ next = tgts[current];
+
+ while (current != n) {
+ tgts[current] = next;
+ current = 0;
+
+ /*
+ * Save off the original data that we're going to
+ * attempt to reconstruct.
+ */
+ for (i = 0; i < n; i++) {
+ ASSERT(orig[i] != NULL);
+ c = tgts[i];
+ ASSERT3S(c, >=, 0);
+ ASSERT3S(c, <, rm->rm_cols);
+ rc = &rm->rm_col[c];
+ bcopy(rc->rc_data, orig[i], rc->rc_size);
+ }
+
+ /*
+ * Attempt a reconstruction and exit the outer loop on
+ * success.
+ */
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
+
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ ASSERT(rc->rc_error == 0);
+ if (rc->rc_tried)
+ raidz_checksum_error(zio, rc,
+ orig[i]);
+ rc->rc_error = ECKSUM;
+ }
+
+ ret = code;
+ goto done;
+ }
+
+ /*
+ * Restore the original data.
+ */
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ bcopy(orig[i], rc->rc_data, rc->rc_size);
+ }
+
+ do {
+ /*
+ * Find the next valid column after the current
+ * position.
+ */
+ for (next = tgts[current] + 1;
+ next < rm->rm_cols &&
+ rm->rm_col[next].rc_error != 0; next++)
+ continue;
+
+ ASSERT(next <= tgts[current + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ */
+ if (next != tgts[current + 1])
+ break;
+
+ /*
+ * Otherwise, find the next valid column after
+ * the previous position.
+ */
+ for (c = tgts[current - 1] + 1;
+ rm->rm_col[c].rc_error != 0; c++)
+ continue;
+
+ tgts[current] = c;
+ current++;
+
+ } while (current != n);
+ }
+ }
+ n--;
+done:
+ for (i = 0; i < n; i++) {
+ zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+ }
+
+ return (ret);
+}
+
static void
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_t *cvd;
raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc, *rc1;
+ raidz_col_t *rc;
int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c, c1;
+ int n, c;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int code;
ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
@@ -859,9 +1934,8 @@ vdev_raidz_io_done(zio_t *zio)
* any errors.
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- switch (data_errors) {
- case 0:
- if (zio_checksum_error(zio) == 0) {
+ if (data_errors == 0) {
+ if (raidz_checksum_verify(zio) == 0) {
/*
* If we read parity information (unnecessarily
* as it happens since no reconstruction was
@@ -880,9 +1954,7 @@ vdev_raidz_io_done(zio_t *zio)
}
goto done;
}
- break;
-
- case 1:
+ } else {
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
@@ -894,45 +1966,38 @@ vdev_raidz_io_done(zio_t *zio)
ASSERT(parity_errors < rm->rm_firstdatacol);
/*
- * Find the column that reported the error.
+ * Identify the data columns that reported an error.
*/
+ n = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
}
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(rm, c);
- } else {
- ASSERT(rm->rm_firstdatacol > 1);
- vdev_raidz_reconstruct_q(rm, c);
- }
+ ASSERT(rm->rm_firstdatacol >= n);
- if (zio_checksum_error(zio) == 0) {
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
- atomic_inc_64(&raidz_corrected_p);
- else
- atomic_inc_64(&raidz_corrected_q);
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
/*
- * If there's more than one parity disk that
- * was successfully read, confirm that the
- * other parity disk produced the correct data.
- * This routine is suboptimal in that it
- * regenerates both the parity we wish to test
- * as well as the parity we just used to
- * perform the reconstruction, but this should
- * be a relatively uncommon case, and can be
- * optimized if it becomes a problem.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
+ * If we read more parity disks than were used
+ * for reconstruction, confirm that the other
+ * parity disks produced correct data. This
+ * routine is suboptimal in that it regenerates
+ * the parity that we already used in addition
+ * to the parity that we're attempting to
+ * verify, but this should be a relatively
+ * uncommon case, and can be optimized if it
+ * becomes a problem. Note that we regenerate
+ * parity when resilvering so we can write it
+ * out to failed devices later.
*/
- if (parity_errors < rm->rm_firstdatacol - 1 ||
+ if (parity_errors < rm->rm_firstdatacol - n ||
(zio->io_flags & ZIO_FLAG_RESILVER)) {
n = raidz_parity_verify(zio, rm);
unexpected_errors += n;
@@ -942,46 +2007,6 @@ vdev_raidz_io_done(zio_t *zio)
goto done;
}
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- ASSERT(rm->rm_firstdatacol == 2);
-
- /*
- * Find the two columns that reported errors.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- for (c1 = c++; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- vdev_raidz_reconstruct_pq(rm, c1, c);
-
- if (zio_checksum_error(zio) == 0) {
- atomic_inc_64(&raidz_corrected_pq);
- goto done;
- }
- break;
-
- default:
- ASSERT(rm->rm_firstdatacol <= 2);
- ASSERT(0);
}
}
@@ -1020,145 +2045,54 @@ vdev_raidz_io_done(zio_t *zio)
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction make
- * sure we have a chance of coming up with the right answer.
+ * in absent data. We check if there is enough additional data to
+ * possibly reconstruct the data and then perform combinatorial
+ * reconstruction over all possible combinations. If that fails,
+ * we're cooked.
*/
- if (total_errors >= rm->rm_firstdatacol) {
+ if (total_errors > rm->rm_firstdatacol) {
zio->io_error = vdev_raidz_worst_error(rm);
- /*
- * If there were exactly as many device errors as parity
- * columns, yet we couldn't reconstruct the data, then at
- * least one device must have returned bad data silently.
- */
- if (total_errors == rm->rm_firstdatacol)
- zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
- goto done;
- }
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity P.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_p(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_p);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
- if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ } else if (total_errors < rm->rm_firstdatacol &&
+ (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
/*
- * Attempt to reconstruct the data from parity Q.
+ * If we didn't use all the available parity for the
+ * combinatorial reconstruction, verify that the remaining
+ * parity is correct.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_q(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 &&
- rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
- rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ if (code != (1 << rm->rm_firstdatacol) - 1)
+ (void) raidz_parity_verify(zio, rm);
+ } else {
/*
- * Attempt to reconstruct the data from both P and Q.
+ * We're here because either:
+ *
+ * total_errors == rm_firstdatacol, or
+ * vdev_raidz_combrec() failed
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+ * Start checksum ereports for all children which haven't
+ * failed, and the IO wasn't speculative.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
- void *orig, *orig1;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
-
- for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
- rc1 = &rm->rm_col[c1];
-
- orig1 = zio_buf_alloc(rc1->rc_size);
- bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(rm, c, c1);
+ zio->io_error = ECKSUM;
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- atomic_inc_64(&raidz_corrected_pq);
-
- /*
- * If these children didn't know they
- * returned bad data, inform them.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- if (rc1->rc_tried && rc1->rc_error == 0)
- raidz_checksum_error(zio, rc1);
-
- rc->rc_error = ECKSUM;
- rc1->rc_error = ECKSUM;
-
- goto done;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
+
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
}
-
- bcopy(orig1, rc1->rc_data, rc1->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
}
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- /*
- * All combinations failed to checksum. Generate checksum ereports for
- * all children.
- */
- zio->io_error = ECKSUM;
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
- rc->rc_offset, rc->rc_size);
}
}
@@ -1205,6 +2139,8 @@ vdev_ops_t vdev_raidz_ops = {
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
index 88383f002b80..879f78f3a5b3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors)
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- int c;
int lasterror = 0;
int numerrors = 0;
@@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
- int error;
- if ((error = vdev_open(cvd)) != 0 &&
- !cvd->vdev_islog) {
- lasterror = error;
+ if (cvd->vdev_open_error && !cvd->vdev_islog) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
- continue;
}
}
@@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
static void
vdev_root_close(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
@@ -113,6 +109,8 @@ vdev_ops_t vdev_root_ops = {
NULL, /* io_start - not applicable to the root */
NULL, /* io_done - not applicable to the root */
vdev_root_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index 7abe63ac917d..288a4d99ab25 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -19,13 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* This file contains the top half of the zfs directory structure
* implementation. The bottom half is in zap_leaf.c.
@@ -45,11 +41,11 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/zfs_znode.h>
int fzap_default_block_shift = 14; /* 16k blocksize */
@@ -73,7 +69,7 @@ fzap_byteswap(void *vbuf, size_t size)
}
void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
dmu_buf_t *db;
zap_leaf_t *l;
@@ -86,7 +82,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
&zap->zap_f.zap_phys, zap_evict);
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
zp = zap->zap_f.zap_phys;
@@ -105,6 +101,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
zp->zap_num_entries = 0;
zp->zap_salt = zap->zap_salt;
zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
/* block 1 will be the first leaf */
for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
@@ -114,7 +111,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
* set up block 1 - the first leaf
*/
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
@@ -175,20 +172,20 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
b = tbl->zt_blks_copied;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
if (err)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs, FTAG, &db_new));
+ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs, FTAG, &db_new));
+ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
db_new->db_data, hepb);
@@ -236,7 +233,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
dmu_buf_will_dirty(db, tx);
@@ -248,7 +245,8 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_buf_t *db2;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+ DMU_READ_NO_PREFETCH);
if (err) {
dmu_buf_rele(db, FTAG);
return (err);
@@ -279,7 +277,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@@ -294,7 +292,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
blk = (idx*2) >> (bs-3);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+ DMU_READ_NO_PREFETCH);
dmu_buf_rele(db, FTAG);
}
return (err);
@@ -318,8 +317,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
static int
zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
{
- /* In case things go horribly wrong. */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
return (ENOSPC);
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
@@ -338,7 +342,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
newblk = zap_allocate_blocks(zap, 1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+ DMU_READ_NO_PREFETCH);
if (err)
return (err);
dmu_buf_will_dirty(db_new, tx);
@@ -389,14 +394,15 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
l->l_phys = NULL;
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ DMU_READ_NO_PREFETCH));
winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
@@ -447,7 +453,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
ASSERT(blkid != 0);
l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = blkid;
l->l_bs = highbit(db->db_size)-1;
@@ -499,7 +505,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- blkid << bs, NULL, &db);
+ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
@@ -703,13 +709,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
}
}
-
static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+fzap_checkname(zap_name_t *zn)
{
- if (name && strlen(name) > ZAP_MAXNAMELEN)
- return (E2BIG);
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+ return (0);
+}
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
/* Only integer sizes supported by C */
switch (integer_size) {
case 1:
@@ -727,6 +737,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
return (0);
}
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+ int err;
+
+ if ((err = fzap_checkname(zn)) != 0)
+ return (err);
+ return (fzap_checksize(integer_size, num_integers));
+}
+
/*
* Routines for manipulating attributes.
*/
@@ -739,8 +759,7 @@ fzap_lookup(zap_name_t *zn,
int err;
zap_entry_handle_t zeh;
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
- if (err != 0)
+ if ((err = fzap_checkname(zn)) != 0)
return (err);
err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
@@ -748,8 +767,13 @@ fzap_lookup(zap_name_t *zn,
return (err);
err = zap_leaf_lookup(l, zn, &zeh);
if (err == 0) {
+ if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
err = zap_entry_read(&zeh, integer_size, num_integers, buf);
- (void) zap_entry_read_name(&zeh, rn_len, realname);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
if (ncp) {
*ncp = zap_entry_normalization_conflict(&zeh,
zn, NULL, zn->zn_zap);
@@ -772,8 +796,7 @@ fzap_add_cd(zap_name_t *zn,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_checksize(zn->zn_name_orij,
- integer_size, num_integers) == 0);
+ ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
if (err != 0)
@@ -787,7 +810,7 @@ retry:
if (err != ENOENT)
goto out;
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+ err = zap_entry_create(l, zn, cd,
integer_size, num_integers, val, &zeh);
if (err == 0) {
@@ -810,12 +833,12 @@ fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
- int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
- val, ZAP_MAXCD, tx));
+ val, ZAP_NEED_CD, tx));
}
int
@@ -828,7 +851,7 @@ fzap_update(zap_name_t *zn,
zap_t *zap = zn->zn_zap;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
@@ -841,8 +864,8 @@ retry:
ASSERT(err == 0 || err == ENOENT);
if (create) {
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
- ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
if (err == 0)
zap_increment_num_entries(zap, 1, tx);
} else {
@@ -904,6 +927,21 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
return (err);
}
+void
+fzap_prefetch(zap_name_t *zn)
+{
+ uint64_t idx, blk;
+ zap_t *zap = zn->zn_zap;
+ int bs;
+
+ idx = ZAP_HASH_IDX(zn->zn_hash,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+}
+
/*
* Helper functions for consumers.
*/
@@ -955,6 +993,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
}
int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za.za_integer_length != 8 || za.za_num_integers != 1)
+ return (EINVAL);
+ err = zap_add(os, intoobj, za.za_name,
+ 8, 1, &value, tx);
+ if (err)
+ return (err);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ uint64_t delta = 0;
+
+ if (za.za_integer_length != 8 || za.za_num_integers != 1)
+ return (EINVAL);
+
+ err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ delta += za.za_first_integer;
+ err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+ if (err)
+ return (err);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
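The two zap_join_*() helpers above fold every single 8-byte-integer entry of one ZAP object into another. A minimal usage sketch, not part of the patch, with hypothetical object numbers and an already-assigned transaction; srcobj and tagobj must hold single 8-byte integer entries:

#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Sketch: fold the counters in "srcobj" into "dstobj", then record
 * every name from "tagobj" in "dstobj" with the fixed value "gen".
 */
static int
merge_counts_and_tag(objset_t *os, uint64_t srcobj, uint64_t tagobj,
    uint64_t dstobj, uint64_t gen, dmu_tx_t *tx)
{
	int err;

	if ((err = zap_join_increment(os, srcobj, dstobj, tx)) != 0)
		return (err);
	return (zap_join_key(os, tagobj, dstobj, gen, tx));
}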
+int
zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
{
char name[20];
@@ -981,6 +1069,56 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
return (zap_lookup(os, obj, name, 8, 1, &value));
}
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
+ uint64_t value = 0;
+ int err;
+
+ if (delta == 0)
+ return (0);
+
+ err = zap_lookup(os, obj, name, 8, 1, &value);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ value += delta;
+ if (value == 0)
+ err = zap_remove(os, obj, name, tx);
+ else
+ err = zap_update(os, obj, name, 8, 1, &value, tx);
+ return (err);
+}
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_increment(os, obj, name, delta, tx));
+}
+
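zap_increment() above is a read-modify-write on a single 8-byte counter that removes the entry when it returns to zero, so a missing entry and a zero counter look the same to readers. A hedged usage sketch follows; the object number, key, and tx are placeholders, not part of the patch:

#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Sketch: bump a per-id byte count, then read it back.
 * zap_increment_int() formats the id as a hex name; a counter that
 * drops back to zero is removed, so ENOENT on lookup means zero.
 */
static int
charge_id(objset_t *os, uint64_t countobj, uint64_t id, int64_t delta,
    dmu_tx_t *tx, uint64_t *newval)
{
	int err;

	if ((err = zap_increment_int(os, countobj, id, delta, tx)) != 0)
		return (err);
	err = zap_lookup_int_key(os, countobj, id, newval);
	if (err == ENOENT) {
		*newval = 0;	/* counter hit zero and was removed */
		err = 0;
	}
	return (err);
}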
/*
* Routines for iterating over the attributes.
*/
@@ -1042,7 +1180,7 @@ again:
err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
ASSERT(err == 0 || err == EOVERFLOW);
}
- err = zap_entry_read_name(&zeh,
+ err = zap_entry_read_name(zap, &zeh,
sizeof (za->za_name), za->za_name);
ASSERT(err == 0);
@@ -1054,7 +1192,6 @@ again:
return (err);
}
-
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
@@ -1081,6 +1218,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
}
}
+int
+fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
+{
+ int err;
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0)
+ return (err);
+
+ zc->zc_leaf = l;
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+
+ return (err);
+}
+
void
fzap_get_stats(zap_t *zap, zap_stats_t *zs)
{
@@ -1126,7 +1288,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
- FTAG, &db);
+ FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {
zap_stats_ptrtbl(zap, db->db_data,
1<<(bs-3), zs);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
index da498b6bc9e3..19a795db825b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -19,24 +19,24 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* The 512-byte leaf is broken into 32 16-byte chunks.
* chunk number n means l_chunk[n], even though the header precedes it.
* the names are stored null-terminated.
*/
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
+#include <sys/arc.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
@@ -127,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
le = &lc->l_entry;
le->le_type = BSWAP_8(le->le_type);
- le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_value_intlen = BSWAP_8(le->le_value_intlen);
le->le_next = BSWAP_16(le->le_next);
le->le_name_chunk = BSWAP_16(le->le_name_chunk);
- le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_name_numints = BSWAP_16(le->le_name_numints);
le->le_value_chunk = BSWAP_16(le->le_value_chunk);
- le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_value_numints = BSWAP_16(le->le_value_numints);
le->le_cd = BSWAP_32(le->le_cd);
le->le_hash = BSWAP_64(le->le_hash);
break;
@@ -215,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
static uint16_t
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
- int integer_size, int num_integers)
+ int integer_size, int num_integers)
{
uint16_t chunk_head;
uint16_t *chunkp = &chunk_head;
@@ -273,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- char *buf)
+ void *buf)
{
int len = MIN(array_len, buf_len);
int byten = 0;
uint64_t value = 0;
+ char *p = buf;
ASSERT3U(array_int_len, <=, buf_int_len);
@@ -285,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
uint8_t *ip = la->la_array;
- uint64_t *buf64 = (uint64_t *)buf;
+ uint64_t *buf64 = buf;
*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
(uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
@@ -300,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
while (chunk != CHAIN_END) {
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
- buf += ZAP_LEAF_ARRAY_BYTES;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
chunk = la->la_next;
}
return;
@@ -316,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
- stv(buf_int_len, buf, value);
+ stv(buf_int_len, p, value);
byten = 0;
len--;
if (len == 0)
return;
- buf += buf_int_len;
+ p += buf_int_len;
}
}
chunk = la->la_next;
}
}
-/*
- * Only to be used on 8-bit arrays.
- * array_len is actual len in bytes (not encoded le_value_length).
- * namenorm is null-terminated.
- */
static boolean_t
-zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len)
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+ int chunk, int array_numints)
{
int bseen = 0;
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey;
+ boolean_t match;
+
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+ thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
+ KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+ sizeof (*thiskey), array_numints, thiskey);
+ match = bcmp(thiskey, zn->zn_key_orig,
+ array_numints * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_numints * sizeof (*thiskey));
+ return (match);
+ }
+
+ ASSERT(zn->zn_key_intlen == 1);
if (zn->zn_matchtype == MT_FIRST) {
- char *thisname = kmem_alloc(array_len, KM_SLEEP);
+ char *thisname = kmem_alloc(array_numints, KM_SLEEP);
boolean_t match;
- zap_leaf_array_read(l, chunk, 1, array_len, 1,
- array_len, thisname);
+ zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+ sizeof (char), array_numints, thisname);
match = zap_match(zn, thisname);
- kmem_free(thisname, array_len);
+ kmem_free(thisname, array_numints);
return (match);
}
- /* Fast path for exact matching */
- while (bseen < array_len) {
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_numints != zn->zn_key_orig_numints)
+ return (B_FALSE);
+ while (bseen < array_numints) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
break;
chunk = la->la_next;
bseen += toread;
}
- return (bseen == array_len);
+ return (bseen == array_numints);
}
/*
@@ -394,9 +414,9 @@ again:
ASSERT(zn->zn_matchtype == MT_EXACT ||
(l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
if (zap_leaf_array_match(l, zn, le->le_name_chunk,
- le->le_name_length)) {
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
+ le->le_name_numints)) {
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
@@ -427,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
{
uint16_t chunk;
uint64_t besth = -1ULL;
- uint32_t bestcd = ZAP_MAXCD;
+ uint32_t bestcd = -1U;
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
uint16_t lh;
struct zap_leaf_entry *le;
@@ -449,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
besth = le->le_hash;
bestcd = le->le_cd;
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_fakechunk = chunk;
@@ -460,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
}
}
- return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+ return (bestcd == -1U ? ENOENT : 0);
}
int
@@ -471,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh,
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- if (le->le_int_size > integer_size)
+ if (le->le_value_intlen > integer_size)
return (EINVAL);
- zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
- le->le_value_length, integer_size, num_integers, buf);
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+ le->le_value_intlen, le->le_value_numints,
+ integer_size, num_integers, buf);
if (zeh->zeh_num_integers > num_integers)
return (EOVERFLOW);
@@ -484,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh,
}
int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
{
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
- le->le_name_length, 1, buflen, buf);
- if (le->le_name_length > buflen)
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_numints, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_numints, 1, buflen, buf);
+ }
+ if (le->le_name_numints > buflen)
return (EOVERFLOW);
return (0);
}
@@ -506,24 +533,16 @@ zap_entry_update(zap_entry_handle_t *zeh,
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
return (EAGAIN);
- /*
- * We should search other chained leaves (via
- * zap_entry_remove,create?) otherwise returning EAGAIN will
- * just send us into an infinite loop if we have to chain
- * another leaf block, rather than being able to split this
- * block.
- */
-
zap_leaf_array_free(l, &le->le_value_chunk);
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
return (0);
}
@@ -550,26 +569,25 @@ zap_entry_remove(zap_entry_handle_t *zeh)
}
int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
uint8_t integer_size, uint64_t num_integers, const void *buf,
zap_entry_handle_t *zeh)
{
uint16_t chunk;
uint16_t *chunkp;
struct zap_leaf_entry *le;
- uint64_t namelen, valuelen;
+ uint64_t valuelen;
int numchunks;
+ uint64_t h = zn->zn_hash;
valuelen = integer_size * num_integers;
- namelen = strlen(name) + 1;
- ASSERT(namelen >= 2);
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
- ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (E2BIG);
- if (cd == ZAP_MAXCD) {
+ if (cd == ZAP_NEED_CD) {
/* find the lowest unused cd */
if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
cd = 0;
@@ -586,7 +604,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
}
} else {
/* old unsorted format; do it the O(n^2) way */
- for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (cd = 0; ; cd++) {
for (chunk = *LEAF_HASH_ENTPTR(l, h);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -601,10 +619,10 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
}
}
/*
- * we would run out of space in a block before we could
- * have ZAP_MAXCD entries
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
*/
- ASSERT3U(cd, <, ZAP_MAXCD);
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
}
if (l->l_phys->l_hdr.lh_nfree < numchunks)
@@ -614,12 +632,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
chunk = zap_leaf_chunk_alloc(l);
le = ZAP_LEAF_ENTRY(l, chunk);
le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
- le->le_name_length = namelen;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_numints);
+ le->le_name_numints = zn->zn_key_orig_numints;
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
le->le_hash = h;
le->le_cd = cd;
@@ -631,7 +650,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
zeh->zeh_leaf = l;
zeh->zeh_num_integers = num_integers;
- zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_integer_size = le->le_value_intlen;
zeh->zeh_cd = le->le_cd;
zeh->zeh_hash = le->le_hash;
zeh->zeh_chunkp = chunkp;
@@ -673,7 +692,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
allocdzn = B_TRUE;
}
if (zap_leaf_array_match(zeh->zeh_leaf, zn,
- le->le_name_chunk, le->le_name_length)) {
+ le->le_name_chunk, le->le_name_numints)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
@@ -836,9 +855,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(l, chunk);
- n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
- le->le_int_size);
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_entries_using_n_chunks[n]++;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index 9453fd293870..b40309741dfa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
@@ -33,38 +31,98 @@
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
+#include <sys/arc.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
#endif
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
static uint64_t
-zap_hash(zap_t *zap, const char *normname)
+zap_hash(zap_name_t *zn)
{
- const uint8_t *cp;
- uint8_t c;
- uint64_t crc = zap->zap_salt;
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
- /* NB: name must already be normalized, if necessary */
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ int i;
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
+ int j;
+ uint64_t word = *wp;
+
+ for (j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ int i, len;
+ const uint8_t *cp = zn->zn_key_norm;
- ASSERT(crc != 0);
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
- }
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ len = zn->zn_key_norm_numints - 1;
+ ASSERT(zn->zn_key_intlen == 1);
+ for (i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
/*
- * Only use 28 bits, since we need 4 bits in the cookie for the
- * collision differentiator. We MUST use the high bits, since
- * those are the ones that we first pay attention to when
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
* chosing the bucket.
*/
- crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
- return (crc);
+ return (h);
}
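For string keys the rewritten zap_hash() above is still the salted CRC-64 of the normalized name, excluding the terminating null, with only the top zap_hashbits() bits kept so the low bits remain free for the collision differentiator. A standalone sketch of that computation; the 256-entry CRC table is assumed to be the same zfs_crc64_table the kernel uses and is passed in rather than reproduced here:

#include <stdint.h>

/*
 * Sketch of the string-key case of zap_hash(): CRC-64 over the key
 * bytes (not the terminating null), seeded with the per-object salt,
 * then masked so only the high-order "hashbits" bits (28 or 48)
 * survive.
 */
static uint64_t
zap_hash_string_sketch(uint64_t salt, const uint64_t crc64_table[256],
    const char *key, int hashbits)
{
	uint64_t h = salt;
	const uint8_t *cp;

	for (cp = (const uint8_t *)key; *cp != '\0'; cp++)
		h = (h >> 8) ^ crc64_table[(h ^ *cp) & 0xFF];

	return (h & ~((1ULL << (64 - hashbits)) - 1));
}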
static int
@@ -73,13 +131,15 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
size_t inlen, outlen;
int err;
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
inlen = strlen(name) + 1;
outlen = ZAP_MAXNAMELEN;
err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
- zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
- &err);
+ zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
+ U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
return (err);
}
@@ -87,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
if (zn->zn_matchtype == MT_FIRST) {
char norm[ZAP_MAXNAMELEN];
if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
return (B_FALSE);
- return (strcmp(zn->zn_name_norm, norm) == 0);
+ return (strcmp(zn->zn_key_norm, norm) == 0);
} else {
/* MT_BEST or MT_EXACT */
- return (strcmp(zn->zn_name_orij, matchname) == 0);
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
}
}
@@ -106,30 +168,49 @@ zap_name_free(zap_name_t *zn)
kmem_free(zn, sizeof (zap_name_t));
}
-/* XXX combine this with zap_lockdir()? */
zap_name_t *
-zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
zn->zn_zap = zap;
- zn->zn_name_orij = name;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
zn->zn_matchtype = mt;
if (zap->zap_normflags) {
- if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
+ if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_normbuf;
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else {
if (mt != MT_EXACT) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_name_orij;
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_numints = zn->zn_key_orig_numints;
}
- zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+ zn->zn_matchtype = MT_EXACT;
+
+ zn->zn_hash = zap_hash(zn);
return (zn);
}
@@ -174,27 +255,27 @@ mze_compare(const void *arg1, const void *arg2)
return (+1);
if (mze1->mze_hash < mze2->mze_hash)
return (-1);
- if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd > mze2->mze_cd)
return (+1);
- if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd < mze2->mze_cd)
return (-1);
return (0);
}
static int
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
mzap_ent_t *mze;
avl_index_t idx;
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(mzep->mze_cd < ZAP_MAXCD);
mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
mze->mze_hash = hash;
- mze->mze_phys = *mzep;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
kmem_free(mze, sizeof (mzap_ent_t));
return (EEXIST);
@@ -214,18 +295,16 @@ mze_find(zap_name_t *zn)
ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
- if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
- return (NULL);
-
mze_tofind.mze_hash = zn->zn_hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
again:
mze = avl_find(avl, &mze_tofind, &idx);
if (mze == NULL)
mze = avl_nearest(avl, idx, AVL_AFTER);
for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
- if (zap_match(zn, mze->mze_phys.mze_name))
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze);
}
if (zn->zn_matchtype == MT_BEST) {
@@ -248,12 +327,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
mze_tofind.mze_hash = hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
cd = 0;
for (mze = avl_find(avl, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (mze->mze_phys.mze_cd != cd)
+ if (mze->mze_cd != cd)
break;
cd++;
}
@@ -292,15 +371,14 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
- rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
rw_enter(&zap->zap_rwlock, RW_WRITER);
zap->zap_objset = os;
zap->zap_object = obj;
zap->zap_dbuf = db;
if (*(uint64_t *)db->db_data != ZBT_MICRO) {
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
- MUTEX_DEFAULT, 0);
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
} else {
zap->zap_ismicro = TRUE;
@@ -337,7 +415,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zn = zap_name_alloc(zap, mze->mze_name,
MT_EXACT);
- if (mze_insert(zap, i, zn->zn_hash, mze) == 0)
+ if (mze_insert(zap, i, zn->zn_hash) == 0)
zap->zap_m.zap_num_entries++;
else {
printf("ZFS WARNING: Duplicated ZAP "
@@ -385,7 +463,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
*zapp = NULL;
- err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
@@ -435,7 +513,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
- return (mzap_upgrade(zapp, tx));
+ return (mzap_upgrade(zapp, tx, 0));
}
err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
ASSERT3U(err, ==, 0);
@@ -455,10 +533,11 @@ zap_unlockdir(zap_t *zap)
}
static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
- int i, sz, nchunks, err;
+ int i, sz, nchunks;
+ int err = 0;
zap_t *zap = *zapp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -468,11 +547,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
- err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
- 1ULL << fzap_default_block_shift, 0, tx);
- if (err) {
- kmem_free(mzp, sz);
- return (err);
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err) {
+ kmem_free(mzp, sz);
+ return (err);
+ }
}
dprintf("upgrading obj=%llu with %u chunks\n",
@@ -480,10 +561,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
/* XXX destroy the avl later, so we can use the stored hash value */
mze_destroy(zap);
- fzap_upgrade(zap, tx);
+ fzap_upgrade(zap, tx, flags);
for (i = 0; i < nchunks; i++) {
- int err;
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
zap_name_t *zn;
if (mze->mze_name[0] == 0)
@@ -503,12 +583,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
}
static void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+ dmu_tx_t *tx)
{
dmu_buf_t *db;
mzap_phys_t *zp;
- VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
#ifdef ZFS_DEBUG
{
@@ -524,6 +605,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
zp->mz_normflags = normflags;
dmu_buf_rele(db, FTAG);
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
+ zap_unlockdir(zap);
+ }
}
int
@@ -544,7 +634,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
if (err != 0)
return (err);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
return (0);
}
@@ -561,7 +651,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
{
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
+ return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+ ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+ leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+ indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+
+ VERIFY(dmu_object_set_blocksize(os, obj,
+ 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+
+ mzap_create_impl(os, obj, normflags, flags, tx);
return (obj);
}
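zap_create_flags() above is the only way to obtain a ZAP with non-zero zap_flags: mzap_create_impl() upgrades such an object to a fat ZAP immediately, and the flags then select 64-bit hashing, raw uint64 keys, or pre-hashed keys. A hedged sketch of creating such an object and inserting one entry with the zap_add_uint64() added later in this patch; the 4K leaf/indirect blockshifts and the key contents are illustrative only:

#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Sketch: create a fat ZAP keyed by raw uint64 words (as a dedup-style
 * table would be) and add one two-word key mapping to a single 8-byte
 * value.  "ot" is whatever object type the caller wants; no bonus
 * buffer is used.
 */
static int
create_uint64_zap(objset_t *os, dmu_object_type_t ot, dmu_tx_t *tx,
    uint64_t *objp)
{
	uint64_t key[2] = { 0x1234, 0x5678 };	/* hypothetical key */
	uint64_t val = 1;

	*objp = zap_create_flags(os, 0 /* normflags */,
	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY,
	    ot, 12 /* 4K leaves */, 12 /* 4K indirects */,
	    DMU_OT_NONE, 0, tx);

	return (zap_add_uint64(os, *objp, key, 2, 8, 1, &val, tx));
}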
@@ -631,11 +740,11 @@ again:
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
if (zn == NULL) {
- zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
MT_FIRST);
allocdzn = B_TRUE;
}
- if (zap_match(zn, other->mze_phys.mze_name)) {
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
@@ -697,9 +806,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
} else if (integer_size != 8) {
err = EINVAL;
} else {
- *(uint64_t *)buf = mze->mze_phys.mze_value;
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
(void) strlcpy(realname,
- mze->mze_phys.mze_name, rn_len);
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
if (ncp) {
*ncp = mzap_normalization_conflict(zap,
zn, mze);
@@ -713,6 +823,63 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
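zap_prefetch_uint64() above only starts the read of the leaf block the key hashes to; it pays off when the caller issues it well before the blocking zap_lookup_uint64(). A sketch of that pattern over a batch of keys, not part of the patch; two-word keys and single 8-byte values are assumed:

#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Sketch: start prefetches for all keys first, then do the blocking
 * lookups, so the leaf-block reads overlap.  "keys" holds n two-word
 * keys; "vals" receives one 8-byte value per key.
 */
static int
lookup_batch(objset_t *os, uint64_t zapobj, const uint64_t (*keys)[2],
    uint64_t *vals, int n)
{
	int i, err;

	for (i = 0; i < n; i++)
		(void) zap_prefetch_uint64(os, zapobj, keys[i], 2);

	for (i = 0; i < n; i++) {
		err = zap_lookup_uint64(os, zapobj, keys[i], 2, 8, 1,
		    &vals[i]);
		if (err != 0)
			return (err);
	}
	return (0);
}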
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+ int err = (zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, MT_EXACT, NULL, 0, NULL));
+ if (err == EOVERFLOW || err == EINVAL)
+ err = 0; /* found, but skipped reading the value */
+ return (err);
+}
+
+int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers)
{
@@ -747,6 +914,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
@@ -755,20 +944,18 @@ mzap_addent(zap_name_t *zn, uint64_t value)
int start = zap->zap_m.zap_alloc_next;
uint32_t cd;
- dprintf("obj=%llu %s=%llu\n", zap->zap_object,
- zn->zn_name_orij, value);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
#ifdef ZFS_DEBUG
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
cd = mze_find_unused_cd(zap, zn->zn_hash);
/* given the limited size of the microzap, this can't happen */
- ASSERT(cd != ZAP_MAXCD);
+ ASSERT(cd < zap_maxcd(zap));
again:
for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
@@ -776,13 +963,13 @@ again:
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, zn->zn_name_orij);
+ (void) strcpy(mze->mze_name, zn->zn_key_orig);
zap->zap_m.zap_num_entries++;
zap->zap_m.zap_alloc_next = i+1;
if (zap->zap_m.zap_alloc_next ==
zap->zap_m.zap_num_chunks)
zap->zap_m.zap_alloc_next = 0;
- VERIFY(0 == mze_insert(zap, i, zn->zn_hash, mze));
+ VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
return;
}
}
@@ -794,7 +981,7 @@ again:
}
int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
@@ -807,7 +994,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
- zn = zap_name_alloc(zap, name, MT_EXACT);
+ zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
return (ENOTSUP);
@@ -816,10 +1003,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ strlen(key) >= MZAP_NAME_LEN) {
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
@@ -839,15 +1024,50 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
mzap_ent_t *mze;
+ uint64_t oldval;
const uint64_t *intval = val;
zap_name_t *zn;
int err;
+#ifdef ZFS_DEBUG
+ /*
+ * If there is an old value, it shouldn't change across the
+ * lockdir (e.g., due to bp rewrite's translation).
+ */
+ if (integer_size == 8 && num_integers == 1)
+ (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
@@ -863,7 +1083,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_update(zn, integer_size, num_integers,
val, tx);
@@ -871,9 +1091,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
} else {
mze = mze_find(zn);
if (mze != NULL) {
- mze->mze_phys.mze_value = *intval;
- zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid].mze_value = *intval;
+ ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+ MZE_PHYS(zap, mze)->mze_value = *intval;
} else {
mzap_addent(zn, *intval);
}
@@ -886,6 +1105,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
}
int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ zap_name_t *zn;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
@@ -926,17 +1170,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
/*
* Routines for iterating over the attributes.
*/
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
@@ -945,15 +1204,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
zc->zc_zapobj = zapobj;
- if (serialized == -1ULL) {
- zc->zc_hash = -1ULL;
- zc->zc_cd = 0;
- } else {
- zc->zc_hash = serialized << (64-ZAP_HASHBITS);
- zc->zc_cd = serialized >> ZAP_HASHBITS;
- if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
- zc->zc_cd = 0;
- }
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
}
void
@@ -983,10 +1236,21 @@ zap_cursor_serialize(zap_cursor_t *zc)
{
if (zc->zc_hash == -1ULL)
return (-1ULL);
- ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
- ASSERT(zc->zc_cd < ZAP_MAXCD);
- return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
- ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So usually use a small
+ * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+ * of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
int
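The serialized cursor above packs the hash into the low bits and the collision differentiator above it, so with the default 28-bit hash and a small cd the whole cookie fits in 32 bits. A standalone worked example of the round trip, mirroring zap_cursor_serialize() above and the deserialization in zap_cursor_retrieve() below; in the kernel, hashbits would be zap_hashbits() (28 or 48):

#include <stdint.h>

/* Sketch of zap_cursor_serialize(): hash in the low bits, cd above. */
static uint64_t
cursor_serialize_sketch(uint64_t hash, uint32_t cd, int hashbits)
{
	return ((hash >> (64 - hashbits)) | ((uint64_t)cd << hashbits));
}

/* Sketch of the inverse, as done when the cursor is next locked. */
static void
cursor_deserialize_sketch(uint64_t serialized, int hashbits,
    uint64_t *hashp, uint32_t *cdp)
{
	*hashp = serialized << (64 - hashbits);
	*cdp = (uint32_t)(serialized >> hashbits);
}

With hashbits = 28 and cd < 16 the serialized value stays below 2^32, which is why 32-bit consumers of the cursor keep working.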
@@ -1001,10 +1265,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
return (ENOENT);
if (zc->zc_zap == NULL) {
+ int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
if (err)
return (err);
+
+ /*
+ * To support zap_cursor_init_serialized, advance, retrieve,
+ * we must add to the existing zc_cd, which may already
+ * be 1 due to the zap_cursor_advance.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
} else {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
}
@@ -1014,7 +1291,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
err = ENOENT;
mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+ mze_tofind.mze_cd = zc->zc_cd;
mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
if (mze == NULL) {
@@ -1022,18 +1299,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
idx, AVL_AFTER);
}
if (mze) {
- ASSERT(0 == bcmp(&mze->mze_phys,
- &zc->zc_zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict =
mzap_normalization_conflict(zc->zc_zap, NULL, mze);
za->za_integer_length = 8;
za->za_num_integers = 1;
- za->za_first_integer = mze->mze_phys.mze_value;
- (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ za->za_first_integer = mzep->mze_value;
+ (void) strcpy(za->za_name, mzep->mze_name);
zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
+ zc->zc_cd = mze->mze_cd;
err = 0;
} else {
zc->zc_hash = -1ULL;
@@ -1049,12 +1324,46 @@ zap_cursor_advance(zap_cursor_t *zc)
if (zc->zc_hash == -1ULL)
return;
zc->zc_cd++;
- if (zc->zc_cd >= ZAP_MAXCD) {
- zc->zc_cd = 0;
- zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
- if (zc->zc_hash == 0) /* EOF */
- zc->zc_hash = -1ULL;
+}
+
+int
+zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
+{
+ int err = 0;
+ mzap_ent_t *mze;
+ zap_name_t *zn;
+
+ if (zc->zc_zap == NULL) {
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, &zc->zc_zap);
+ if (err)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+
+ zn = zap_name_alloc(zc->zc_zap, name, mt);
+ if (zn == NULL) {
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (ENOTSUP);
+ }
+
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_move_to_key(zc, zn);
+ } else {
+ mze = mze_find(zn);
+ if (mze == NULL) {
+ err = ENOENT;
+ goto out;
+ }
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
}
+
+out:
+ zap_name_free(zn);
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
}
int
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index fc25bfe1de1f..f893383a5370 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -48,6 +47,7 @@
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <acl/acl_common.h>
#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
@@ -71,8 +71,7 @@
#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
ACE_DELETE|ACE_DELETE_CHILD)
-#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\
- ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
@@ -319,6 +318,117 @@ static acl_ops_t zfs_acl_fuid_ops = {
zfs_ace_fuid_data
};
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * After upgrade, the SA_ZPL_ZNODE_ACL should have been
+ * removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
static int
zfs_acl_version(int version)
{
@@ -334,7 +444,7 @@ zfs_acl_version_zp(znode_t *zp)
return (zfs_acl_version(zp->z_zfsvfs->z_version));
}
-static zfs_acl_t *
+zfs_acl_t *
zfs_acl_alloc(int vers)
{
zfs_acl_t *aclp;
@@ -350,7 +460,7 @@ zfs_acl_alloc(int vers)
return (aclp);
}
-static zfs_acl_node_t *
+zfs_acl_node_t *
zfs_acl_node_alloc(size_t bytes)
{
zfs_acl_node_t *aclnode;
@@ -461,6 +571,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
{
zfs_acl_node_t *aclnode;
+ ASSERT(aclp);
+
if (start == NULL) {
aclnode = list_head(&aclp->z_acl);
if (aclnode == NULL)
@@ -507,6 +619,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
*who = aclp->z_ops.ace_who_get(acep);
aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
aclnode->z_ace_idx++;
+
return ((void *)acep);
}
return (NULL);
@@ -540,7 +653,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
*/
int
zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
- void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
zfs_fuid_info_t **fuidp, cred_t *cr)
{
int i;
@@ -771,8 +884,9 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
* Determine mode of file based on ACL.
* Also, create FUIDs for any User/Group ACEs
*/
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
{
int entry_type;
mode_t mode;
@@ -783,7 +897,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
uint32_t access_mask;
boolean_t an_exec_denied = B_FALSE;
- mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
while (acep = zfs_acl_next_ace(aclp, acep, &who,
&access_mask, &iflags, &type)) {
@@ -801,7 +915,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
entry_type == OWNING_GROUP))
continue;
- if (entry_type == ACE_OWNER) {
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
if ((access_mask & ACE_READ_DATA) &&
(!(seen & S_IRUSR))) {
seen |= S_IRUSR;
@@ -823,7 +938,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
mode |= S_IXUSR;
}
}
- } else if (entry_type == OWNING_GROUP) {
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
if ((access_mask & ACE_READ_DATA) &&
(!(seen & S_IRGRP))) {
seen |= S_IRGRP;
@@ -928,61 +1044,29 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
an_exec_denied = B_TRUE;
if (an_exec_denied)
- zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
else
- zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+ *pflags |= ZFS_NO_EXECS_DENIED;
return (mode);
}
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
- zfs_acl_t *aclp;
- zfs_acl_node_t *aclnode;
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-
- /*
- * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
- * Version 0 didn't have a size field, only a count.
- */
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size;
- aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
- } else {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size;
- }
-
- aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0);
- aclnode->z_ace_count = aclp->z_acl_count;
- if (will_modify) {
- bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
- aclp->z_acl_bytes);
- } else {
- aclnode->z_size = aclp->z_acl_bytes;
- aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
- }
-
- list_insert_head(&aclp->z_acl, aclnode);
-
- return (aclp);
-}
-
/*
* Read an external acl object. If the intent is to modify, always
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
{
- uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
zfs_acl_t *aclp;
- size_t aclsize;
- size_t acl_count;
+ int aclsize;
+ int acl_count;
zfs_acl_node_t *aclnode;
- int error;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+ boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
@@ -991,46 +1075,97 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
return (0);
}
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
- *aclpp = zfs_acl_node_read_internal(zp, will_modify);
- if (!will_modify)
- zp->z_acl_cached = *aclpp;
- return (0);
+ /*
+ * Close the race where the znode could be upgraded while trying
+ * to read the znode attributes.
+ *
+ * But this could only happen if the file isn't already an SA
+ * znode.
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
}
+ version = zfs_znode_acl_version(zp);
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- zfs_acl_phys_v0_t *zacl0 =
- (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
-
- aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
- acl_count = zacl0->z_acl_count;
- } else {
- aclsize = zp->z_phys->zp_acl.z_acl_size;
- acl_count = zp->z_phys->zp_acl.z_acl_count;
- if (aclsize == 0)
- aclsize = acl_count * sizeof (zfs_ace_t);
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
}
- aclnode = zfs_acl_node_alloc(aclsize);
- list_insert_head(&aclp->z_acl, aclnode);
- error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
- aclnode->z_ace_count = acl_count;
+
+ aclp = zfs_acl_alloc(version);
+
aclp->z_acl_count = acl_count;
aclp->z_acl_bytes = aclsize;
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
if (error != 0) {
zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = EIO;
- return (error);
+ goto done;
}
+ list_insert_head(&aclp->z_acl, aclnode);
+
*aclpp = aclp;
if (!will_modify)
zp->z_acl_cached = aclp;
- return (0);
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
+}
+
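+/*
+ * SA data locator callback used when writing the DACL_ACES attribute:
+ * the SA framework calls it repeatedly and each call hands back the
+ * next zfs_acl_node_t buffer, so a multi-node ACL can be written
+ * without first being flattened into one contiguous buffer.
+ */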
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
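+/*
+ * Recompute the cached mode and pflags from the on-disk ACL after an
+ * ownership change; the caller must already hold both z_lock and
+ * z_acl_lock (asserted below).
+ */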
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
}
/*
@@ -1043,28 +1178,35 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
int
zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
{
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_acl_phys_t *zacl = &zphys->zp_acl;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
- uint64_t off = 0;
- dmu_object_type_t otype;
- zfs_acl_node_t *aclnode;
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
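+
+ /*
+ * Note: mode, pflags, ctime and the ACL attributes added below all
+ * ride in this one bulk array and are committed together by the
+ * sa_bulk_update() call at the end of this function.
+ */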
if (zp->z_acl_cached) {
zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = NULL;
}
- zphys->zp_mode = zfs_mode_compute(zp, aclp);
-
/*
- * Decide which object type to use. If we are forced to
- * use old ACL format then transform ACL into zfs_oldace_t
- * layout.
+ * Upgrade needed?
*/
if (!zfsvfs->z_use_fuids) {
otype = DMU_OT_OLDACL;
@@ -1076,84 +1218,113 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
otype = DMU_OT_ACL;
}
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid && aclp->z_version != zacl->z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
+ /*
+ * Arrgh, we have to handle old on disk format
+ * as well as newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ zfs_acl_phys_t acl_phys;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
} else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- aclp->z_acl_bytes, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
}
- } else {
- void *start = zacl->z_ace_data;
/*
- * Migrating back embedded?
+ * If Old version then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
*/
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
- }
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start, aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
}
- }
+ acl_phys.z_acl_version = aclp->z_version;
- /*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- zphys->zp_acl.z_acl_size = aclp->z_acl_count;
- zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
- } else {
- zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
}
- zphys->zp_acl.z_acl_version = aclp->z_version;
-
/*
* Replace ACL wide bits, but first clear them.
*/
- zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
- zp->z_phys->zp_flags |= aclp->z_hints;
+ zp->z_pflags |= aclp->z_hints;
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
- return (0);
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}
/*
@@ -1223,314 +1394,64 @@ zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep,
aclp->z_ops.ace_mask_set(acep, acepmask);
}
-/*
- * Apply mode to canonical six ACEs.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
- zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
- void *acep;
- int maskoff = aclp->z_ops.ace_mask_off();
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
- ASSERT(aclnode != NULL);
-
- acep = (void *)((caddr_t)aclnode->z_acldata +
- aclnode->z_size - (abstract_size * 6));
-
- /*
- * Fixup final ACEs to match the mode
- */
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0700) >> 6); /* owner@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
-
- adjust_ace_pair_common(acep, maskoff, abstract_size,
- (mode & 0070) >> 3); /* group@ */
-
- acep = (caddr_t)acep + (abstract_size * 2);
- adjust_ace_pair_common(acep, maskoff,
- abstract_size, mode); /* everyone@ */
-}
-
-
-static int
-zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny,
- int entry_type, int accessmask)
-{
- uint32_t mask = aclp->z_ops.ace_mask_get(acep);
- uint16_t type = aclp->z_ops.ace_type_get(acep);
- uint16_t flags = aclp->z_ops.ace_flags_get(acep);
-
- return (mask == accessmask && type == allow_deny &&
- ((flags & ACE_TYPE_FLAGS) == entry_type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep)
-{
- int okay_masks;
- uint16_t prevtype;
- uint16_t prevflags;
- uint16_t flags;
- uint32_t mask, prevmask;
-
- if (prevacep == NULL)
- return (B_FALSE);
-
- prevtype = aclp->z_ops.ace_type_get(prevacep);
- prevflags = aclp->z_ops.ace_flags_get(prevacep);
- flags = aclp->z_ops.ace_flags_get(acep);
- mask = aclp->z_ops.ace_mask_get(acep);
- prevmask = aclp->z_ops.ace_mask_get(prevacep);
-
- if (prevtype != DENY)
- return (B_FALSE);
-
- if (prevflags != (flags & ACE_IDENTIFIER_GROUP))
- return (B_FALSE);
-
- okay_masks = (mask & OKAY_MASK_BITS);
-
- if (prevmask & ~okay_masks)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Insert new ACL node into chain of zfs_acl_node_t's
- *
- * This will result in two possible results.
- * 1. If the ACL is currently just a single zfs_acl_node and
- * we are prepending the entry then current acl node will have
- * a new node inserted above it.
- *
- * 2. If we are inserting in the middle of current acl node then
- * the current node will be split in two and new node will be inserted
- * in between the two split nodes.
- */
-static zfs_acl_node_t *
-zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
-{
- zfs_acl_node_t *newnode;
- zfs_acl_node_t *trailernode = NULL;
- zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp);
- int curr_idx = aclp->z_curr_node->z_ace_idx;
- int trailer_count;
- size_t oldsize;
-
- newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
- newnode->z_ace_count = 1;
-
- oldsize = currnode->z_size;
-
- if (curr_idx != 1) {
- trailernode = zfs_acl_node_alloc(0);
- trailernode->z_acldata = acep;
-
- trailer_count = currnode->z_ace_count - curr_idx + 1;
- currnode->z_ace_count = curr_idx - 1;
- currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
- trailernode->z_size = oldsize - currnode->z_size;
- trailernode->z_ace_count = trailer_count;
- }
-
- aclp->z_acl_count += 1;
- aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
-
- if (curr_idx == 1)
- list_insert_before(&aclp->z_acl, currnode, newnode);
- else
- list_insert_after(&aclp->z_acl, currnode, newnode);
- if (trailernode) {
- list_insert_after(&aclp->z_acl, newnode, trailernode);
- aclp->z_curr_node = trailernode;
- trailernode->z_ace_idx = 1;
- }
-
- return (newnode);
-}
-
-/*
- * Prepend deny ACE
- */
-static void *
-zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
- mode_t mode)
-{
- zfs_acl_node_t *aclnode;
- void *newacep;
- uint64_t fuid;
- uint16_t flags;
-
- aclnode = zfs_acl_ace_insert(aclp, acep);
- newacep = aclnode->z_acldata;
- fuid = aclp->z_ops.ace_who_get(acep);
- flags = aclp->z_ops.ace_flags_get(acep);
- zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
-
- return (newacep);
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
static void
-zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
{
- zfs_acl_node_t *aclnode;
- zfs_acl_node_t *currnode;
- void *newacep;
- uint16_t type, flags;
- uint32_t mask;
- uint64_t fuid;
-
- type = aclp->z_ops.ace_type_get(acep);
- flags = aclp->z_ops.ace_flags_get(acep);
- mask = aclp->z_ops.ace_mask_get(acep);
- fuid = aclp->z_ops.ace_who_get(acep);
-
- aclnode = zfs_acl_ace_insert(aclp, acep);
- newacep = aclnode->z_acldata;
-
- aclp->z_ops.ace_type_set(newacep, type);
- aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
- aclp->z_ops.ace_mask_set(newacep, mask);
- aclp->z_ops.ace_type_set(newacep, type);
- aclp->z_ops.ace_who_set(newacep, fuid);
- aclp->z_next_ace = acep;
- flags &= ~ALL_INHERIT;
- aclp->z_ops.ace_flags_set(acep, flags);
- currnode = zfs_acl_curr_node(aclp);
- ASSERT(currnode->z_ace_idx >= 1);
- currnode->z_ace_idx -= 1;
-}
-
-/*
- * Are ACES started at index i, the canonical six ACES?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp)
-{
- void *acep;
- zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
- int i = 0;
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
- ASSERT(aclnode != NULL);
-
- if (aclnode->z_ace_count < 6)
- return (0);
-
- acep = (void *)((caddr_t)aclnode->z_acldata +
- aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6));
-
- if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- DENY, ACE_OWNER, 0) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY,
- OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep +
- (abstract_size * i++),
- ALLOW, OWNING_GROUP, 0) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
- zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
- ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) {
- return (1);
- } else {
- return (0);
- }
-}
-
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner. If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
- mode_t mode)
-{
- uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
- uint32_t mask = aclp->z_ops.ace_mask_get(acep);
- uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
- mode_t extramode = (mode >> 3) & 07;
- mode_t ownermode = (mode >> 6);
-
- if (prevflags & ACE_IDENTIFIER_GROUP) {
-
- extramode &= ~ownermode;
-
- if (extramode) {
- if (extramode & S_IROTH) {
- prevmask &= ~ACE_READ_DATA;
- mask &= ~ACE_READ_DATA;
- }
- if (extramode & S_IWOTH) {
- prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- }
- if (extramode & S_IXOTH) {
- prevmask &= ~ACE_EXECUTE;
- mask &= ~ACE_EXECUTE;
- }
- }
- }
- aclp->z_ops.ace_mask_set(acep, mask);
- aclp->z_ops.ace_mask_set(prevacep, prevmask);
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
-static void
-zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
- uint64_t mode, zfs_acl_t *aclp)
-{
- void *acep = NULL, *prevacep = NULL;
+ void *acep = NULL;
uint64_t who;
- int i;
+ int new_count, new_bytes;
+ int ace_size;
int entry_type;
- int reuse_deny;
- int need_canonical_six = 1;
uint16_t iflags, type;
uint32_t access_mask;
-
- /*
- * If discard then just discard all ACL nodes which
- * represent the ACEs.
- *
- * New owner@/group@/everone@ ACEs will be added
- * later.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
- zfs_acl_release_nodes(aclp);
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops.ace_abstract_size();
+ void *zacep;
+ uint32_t owner, group, everyone;
+ uint32_t deny1, deny2, allow0;
+
+ new_count = new_bytes = 0;
+
+ acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2,
+ &owner, &group, &everyone);
+
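+ /*
+ * Build the replacement ACL in one new node: optional allow/deny
+ * prefix entries derived from the mode, then the surviving explicit
+ * entries, then the trailing owner@/group@/everyone@ entries whose
+ * masks were computed by acl_trivial_access_masks() above.
+ */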
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (allow0) {
+ zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny1) {
+ zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny2) {
+ zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
+ uint16_t inherit_flags;
entry_type = (iflags & ACE_TYPE_FLAGS);
- iflags = (iflags & ALL_INHERIT);
+ inherit_flags = (iflags & ALL_INHERIT);
+
+ if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) &&
+ ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) {
+ continue;
+ }
if ((type != ALLOW && type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- if (iflags)
+ (inherit_flags & ACE_INHERIT_ONLY_ACE)) {
+ if (inherit_flags)
aclp->z_hints |= ZFS_INHERIT_ACE;
switch (type) {
case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@@ -1540,116 +1461,58 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
aclp->z_hints |= ZFS_ACL_OBJ_ACE;
break;
}
- goto nextace;
- }
-
- /*
- * Need to split ace into two?
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) &&
- (!(iflags & ACE_INHERIT_ONLY_ACE))) {
- zfs_acl_split_ace(aclp, acep);
- aclp->z_hints |= ZFS_INHERIT_ACE;
- goto nextace;
- }
-
- if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) {
- access_mask &= ~OGE_CLEAR;
- aclp->z_ops.ace_mask_set(acep, access_mask);
- goto nextace;
} else {
- reuse_deny = B_TRUE;
- if (type == ALLOW) {
-
- /*
- * Check preceding ACE if any, to see
- * if we need to prepend a DENY ACE.
- * This is only applicable when the acl_mode
- * property == groupmask.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
- reuse_deny = zfs_reuse_deny(aclp, acep,
- prevacep);
-
- if (!reuse_deny) {
- prevacep =
- zfs_acl_prepend_deny(uid,
- aclp, acep, mode);
- } else {
- zfs_acl_prepend_fixup(
- aclp, prevacep,
- acep, mode, uid);
- }
- zfs_fixup_group_entries(aclp, acep,
- prevacep, mode);
- }
- }
- }
-nextace:
- prevacep = acep;
- }
-
- /*
- * Check out last six aces, if we have six.
- */
- if (aclp->z_acl_count >= 6) {
- if (zfs_have_canonical_six(aclp)) {
- need_canonical_six = 0;
+ /*
+ * Limit permissions to be no greater than
+ * group permissions
+ */
+ if (type == ALLOW && zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) {
+ if (!(mode & S_IRGRP))
+ access_mask &= ~ACE_READ_DATA;
+ if (!(mode & S_IWGRP))
+ access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ if (!(mode & S_IXGRP))
+ access_mask &= ~ACE_EXECUTE;
+ access_mask &=
+ ~(ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS);
+ }
}
- }
-
- if (need_canonical_six) {
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
- zfs_acl_node_t *aclnode =
- zfs_acl_node_alloc(abstract_size * 6);
-
- aclnode->z_size = abstract_size * 6;
- aclnode->z_ace_count = 6;
- aclp->z_acl_bytes += aclnode->z_size;
- list_insert_tail(&aclp->z_acl, aclnode);
-
- zacep = aclnode->z_acldata;
-
- i = 0;
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- 0, DENY, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- DENY, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
- ALLOW, -1, OWNING_GROUP);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE);
- zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
- EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE);
- aclp->z_acl_count += 6;
- }
-
- zfs_acl_fixup_canonical_six(aclp, mode);
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops.ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
}
-int
+void
zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
{
- int error;
-
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
- *aclp = NULL;
- error = zfs_acl_node_read(zp, aclp, B_TRUE);
- if (error == 0) {
- (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
- zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
- }
- mutex_exit(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp);
mutex_exit(&zp->z_lock);
- return (error);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(*aclp);
}
/*
@@ -1691,8 +1554,8 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
uint64_t mode, boolean_t *need_chmod)
{
void *pacep;
- void *acep, *acep2;
- zfs_acl_node_t *aclnode, *aclnode2;
+ void *acep;
+ zfs_acl_node_t *aclnode;
zfs_acl_t *aclp = NULL;
uint64_t who;
uint32_t access_mask;
@@ -1714,7 +1577,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
*need_chmod = B_TRUE;
pacep = NULL;
aclp = zfs_acl_alloc(paclp->z_version);
- if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD)
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype == VLNK)
return (aclp);
while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
&access_mask, &iflags, &type)) {
@@ -1743,11 +1606,11 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
OWNING_GROUP)) && (vreg || (vdir && (iflags &
ACE_DIRECTORY_INHERIT_ACE)))) {
*need_chmod = B_FALSE;
+ }
- if (!vdir && passthrough_x &&
- ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
- access_mask &= ~ACE_EXECUTE;
- }
+ if (!vdir && passthrough_x &&
+ ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
}
aclnode = zfs_acl_node_alloc(ace_size);
@@ -1765,6 +1628,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
&data2)) == data1sz);
bcopy(data1, data2, data2sz);
}
+
aclp->z_acl_count++;
aclnode->z_ace_count++;
aclp->z_acl_bytes += aclnode->z_size;
@@ -1783,38 +1647,17 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
ASSERT(vdir);
- newflags = aclp->z_ops.ace_flags_get(acep);
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
if ((iflags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) !=
- ACE_FILE_INHERIT_ACE) {
- aclnode2 = zfs_acl_node_alloc(ace_size);
- list_insert_tail(&aclp->z_acl, aclnode2);
- acep2 = aclnode2->z_acldata;
- zfs_set_ace(aclp, acep2,
- access_mask, type, who,
- iflags|ACE_INHERITED_ACE);
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
newflags |= ACE_INHERIT_ONLY_ACE;
- aclp->z_ops.ace_flags_set(acep, newflags);
- newflags &= ~ALL_INHERIT;
- aclp->z_ops.ace_flags_set(acep2,
+ aclp->z_ops.ace_flags_set(acep,
newflags|ACE_INHERITED_ACE);
-
- /*
- * Copy special opaque data if any
- */
- if ((data1sz = aclp->z_ops.ace_data(acep,
- &data1)) != 0) {
- VERIFY((data2sz =
- aclp->z_ops.ace_data(acep2,
- &data2)) == data1sz);
- bcopy(data1, data2, data1sz);
- }
- aclp->z_acl_count++;
- aclnode2->z_ace_count++;
- aclp->z_acl_bytes += aclnode->z_size;
- zfs_restricted_update(zfsvfs, aclp, acep2);
} else {
- newflags |= ACE_INHERIT_ONLY_ACE;
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
aclp->z_ops.ace_flags_set(acep,
newflags|ACE_INHERITED_ACE);
}
@@ -1835,6 +1678,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
zfs_acl_t *paclp;
gid_t gid;
boolean_t need_chmod = B_TRUE;
+ boolean_t inherited = B_FALSE;
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1843,7 +1687,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
&acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
return (error);
-
/*
* Determine uid and gid.
*/
@@ -1865,21 +1708,36 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
(uint64_t)vap->va_gid,
cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
- if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
+ if (acl_ids->z_fgid != dzp->z_gid &&
!groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0)
acl_ids->z_fgid = 0;
}
if (acl_ids->z_fgid == 0) {
- if (dzp->z_phys->zp_mode & S_ISGID) {
- acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP);
+
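+ /*
+ * When the inherited group id is ephemeral, record its
+ * domain/rid in the fuid info so it is carried along with
+ * this create.
+ */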
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
} else {
acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
ZFS_GROUP, cr, &acl_ids->z_fuidp);
#ifdef __FreeBSD__
- gid = acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ gid = acl_ids->z_fgid = dzp->z_gid;
#else
gid = crgetgid(cr);
#endif
@@ -1894,7 +1752,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
* file's new group, clear the file's set-GID bit.
*/
- if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
(vap->va_type == VDIR)) {
acl_ids->z_mode |= S_ISGID;
} else {
@@ -1904,28 +1762,38 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
}
if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
- (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
- !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
- mutex_enter(&dzp->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
- mutex_exit(&dzp->z_acl_lock);
+ (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
} else {
acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
- acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+ acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
ZFS_ACL_AUTO_INHERIT : 0;
- zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
- acl_ids->z_mode, acl_ids->z_aclp);
+ zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp);
}
}
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
return (0);
}
@@ -1946,8 +1814,8 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
boolean_t
zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
{
- return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
- zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+ return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+ zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
}
/*
@@ -1965,15 +1833,15 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
- if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
- return (error);
-
if (mask == 0)
return (ENOSYS);
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
+ return (error);
+
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1982,8 +1850,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
/*
* Scan ACL to determine number of ACEs
*/
- if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
- !(mask & VSA_ACE_ALLTYPES)) {
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
void *zacep = NULL;
uint64_t who;
uint32_t access_mask;
@@ -2004,7 +1871,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
vsecp->vsa_aclcnt = count;
} else
- count = aclp->z_acl_count;
+ count = (int)aclp->z_acl_count;
if (mask & VSA_ACECNT) {
vsecp->vsa_aclcnt = count;
@@ -2038,11 +1905,11 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
if (mask & VSA_ACE_ACLFLAGS) {
vsecp->vsa_aclflags = 0;
- if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
}
@@ -2120,11 +1987,12 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
zfs_acl_t *aclp;
zfs_fuid_info_t *fuidp = NULL;
boolean_t fuid_dirtied;
+ uint64_t acl_obj;
if (mask == 0)
return (ENOSYS);
- if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
+ if (zp->z_pflags & ZFS_IMMUTABLE)
return (EPERM);
if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
@@ -2140,37 +2008,41 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
* existing flags.
*/
if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
- aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
}
top:
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL? */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- zp->z_phys->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading, then take out the necessary DMU holds.
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
} else {
- dmu_tx_hold_write(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, aclp->z_acl_bytes);
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
}
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
}
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
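+ /* Reserve extra holds in case this setacl triggers an SA upgrade. */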
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
mutex_exit(&zp->z_acl_lock);
@@ -2188,20 +2060,20 @@ top:
error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
if (fuidp)
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
done:
- mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
return (error);
}
@@ -2226,15 +2098,15 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
*/
if ((v4_mode & WRITE_MASK_DATA) &&
(((ZTOV(zp)->v_type != VDIR) &&
- (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
(ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
return (EPERM);
}
#ifdef sun
if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+ (zp->z_pflags & ZFS_NOUNLINK)) {
return (EPERM);
}
#else
@@ -2244,13 +2116,13 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
* handled in zfs_zaccess_delete().
*/
if ((v4_mode & ACE_DELETE) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+ (zp->z_pflags & ZFS_NOUNLINK)) {
return (EPERM);
}
#endif
if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
return (EACCES);
}
@@ -2297,19 +2169,21 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
uint32_t deny_mask = 0;
zfs_ace_hdr_t *acep = NULL;
boolean_t checkit;
- uid_t fowner;
uid_t gowner;
+ uid_t fowner;
zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
}
+ ASSERT(zp->z_acl_cached);
+
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
uint32_t mask_matched;
@@ -2409,18 +2283,10 @@ zfs_has_access(znode_t *zp, cred_t *cr)
uint32_t have = ACE_ALL_PERMS;
if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- uid_t owner;
-
- owner = zfs_fuid_map_id(zp->z_zfsvfs,
- zp->z_phys->zp_uid, cr, ZFS_OWNER);
+ uid_t owner;
- return (
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
- secpolicy_vnode_chown(ZTOV(zp), cr, owner) == 0 ||
- secpolicy_vnode_setdac(ZTOV(zp), cr, owner) == 0 ||
- secpolicy_vnode_remove(ZTOV(zp), cr) == 0);
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
}
return (B_TRUE);
}
@@ -2478,38 +2344,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
boolean_t owner = B_FALSE;
boolean_t groupmbr = B_FALSE;
boolean_t is_attr;
- uid_t fowner;
- uid_t gowner;
uid_t uid = crgetuid(cr);
int error;
- if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
- is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
(ZTOV(zdp)->v_type == VDIR));
if (is_attr)
goto slow;
+
mutex_enter(&zdp->z_acl_lock);
- if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
- if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
- FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
mutex_exit(&zdp->z_acl_lock);
goto slow;
}
- fowner = (uid_t)zdp->z_phys->zp_uid;
- gowner = (uid_t)zdp->z_phys->zp_gid;
-
- if (uid == fowner) {
+ if (uid == zdp->z_uid) {
owner = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXUSR) {
+ if (zdp->z_mode & S_IXUSR) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2517,9 +2378,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
goto slow;
}
}
- if (groupmember(gowner, cr)) {
+ if (groupmember(zdp->z_gid, cr)) {
groupmbr = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXGRP) {
+ if (zdp->z_mode & S_IXGRP) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2528,7 +2389,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
}
}
if (!owner && !groupmbr) {
- if (zdp->z_phys->zp_mode & S_IXOTH) {
+ if (zdp->z_mode & S_IXOTH) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
@@ -2545,8 +2406,9 @@ slow:
}
/*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsytem when a deny is determined.
+ * Determine whether access should be granted/denied.
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
*/
int
zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
@@ -2554,13 +2416,13 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
uint32_t working_mode;
int error;
int is_attr;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
boolean_t check_privs;
znode_t *xzp;
znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
#ifdef __FreeBSD__
/*
@@ -2568,15 +2430,22 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
* Note that not checking them is not just an optimization - without
* this shortcut, EA operations may bogusly fail with EACCES.
*/
- if (zp->z_phys->zp_flags & ZFS_XATTR)
+ if (zp->z_pflags & ZFS_XATTR)
return (0);
#else
/*
* If attribute then validate against base file
*/
if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
+ parent, &xzp)) != 0) {
return (error);
}
@@ -2598,11 +2467,36 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
#endif
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. working_mode tracks the ACE bits the ACL check
+ * leaves unsatisfied; those are translated into checkmode and
+ * secpolicy_vnode_access2() is called with (needed_bits & ~checkmode)
+ * and needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
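+ /*
+ * needed_bits expresses the request in VREAD/VWRITE/VEXEC terms for
+ * secpolicy_vnode_access2(), which is consulted both when the ACL
+ * grants the access and when privileges may override a denial.
+ */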
if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
&check_privs, skipaclchk, cr)) == 0) {
if (is_attr)
VN_RELE(ZTOV(xzp));
- return (0);
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
}
if (error && !check_privs) {
@@ -2616,12 +2510,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
if (error && check_privs) {
- uid_t owner;
mode_t checkmode = 0;
- owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
- ZFS_OWNER);
-
/*
* First check for implicit owner permission on
* read_acl/read_attributes
@@ -2643,9 +2533,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
if (working_mode & ACE_EXECUTE)
checkmode |= VEXEC;
- if (checkmode)
- error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- owner, checkmode);
+ error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
if (error == 0 && (working_mode & ACE_WRITE_OWNER))
error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
@@ -2668,8 +2557,12 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
error = EACCES;
}
}
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
}
+
if (is_attr)
VN_RELE(ZTOV(xzp));
@@ -2699,15 +2592,15 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
static int
zfs_delete_final_check(znode_t *zp, znode_t *dzp,
- mode_t missing_perms, cred_t *cr)
+ mode_t available_perms, cred_t *cr)
{
int error;
uid_t downer;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
- error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
if (error == 0)
error = zfs_sticky_remove_access(dzp, zp, cr);
@@ -2756,7 +2649,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
uint32_t dzp_working_mode = 0;
uint32_t zp_working_mode = 0;
int dzp_error, zp_error;
- mode_t missing_perms;
+ mode_t available_perms;
boolean_t dzpcheck_privs = B_TRUE;
boolean_t zpcheck_privs = B_TRUE;
@@ -2774,7 +2667,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
* to determine what was found.
*/
- if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
return (EPERM);
/*
@@ -2817,23 +2710,20 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
* only need to see if we have write/execute on directory.
*/
- if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
- return (zfs_sticky_remove_access(dzp, zp, cr));
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
- if (!dzpcheck_privs)
+ if (dzp_error != 0 && !dzpcheck_privs)
return (dzp_error);
/*
* Fourth row
*/
- missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
- missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
-
- ASSERT(missing_perms);
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
- return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
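+ /*
+ * available_perms now holds the VWRITE/VEXEC bits the directory ACL
+ * already granted; zfs_delete_final_check() only asks the privilege
+ * subsystem to cover whatever is still missing.
+ */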
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
}
@@ -2844,7 +2734,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
int add_perm;
int error;
- if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
add_perm = (ZTOV(szp)->v_type == VDIR) ?
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
index cd36696f9500..acf632bdbeff 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,6 +27,7 @@
#include <sys/vfs.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
#include <sys/zfs_acl.h>
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index 48c3ebf78a58..7372ee74d3ea 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -110,17 +110,41 @@ snapentry_compare(const void *a, const void *b)
return (0);
}
+#ifdef sun
+vnodeops_t *zfsctl_ops_root;
+vnodeops_t *zfsctl_ops_snapdir;
+vnodeops_t *zfsctl_ops_snapshot;
+vnodeops_t *zfsctl_ops_shares;
+vnodeops_t *zfsctl_ops_shares_dir;
+
+static const fs_operation_def_t zfsctl_tops_root[];
+static const fs_operation_def_t zfsctl_tops_snapdir[];
+static const fs_operation_def_t zfsctl_tops_snapshot[];
+static const fs_operation_def_t zfsctl_tops_shares[];
+#else /* !sun */
static struct vop_vector zfsctl_ops_root;
static struct vop_vector zfsctl_ops_snapdir;
static struct vop_vector zfsctl_ops_snapshot;
static struct vop_vector zfsctl_ops_shares;
static struct vop_vector zfsctl_ops_shares_dir;
+#endif /* !sun */
static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
+#ifdef sun
+static gfs_opsvec_t zfsctl_opsvec[] = {
+ { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+ { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+ { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+ { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
+ { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
+ { NULL }
+};
+#endif /* sun */
+
/*
* Root directory elements. We only have two entries
* snapshot and shares.
@@ -144,11 +168,35 @@ static gfs_dirent_t zfsctl_root_entries[] = {
void
zfsctl_init(void)
{
+#ifdef sun
+ VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+#endif
}
void
zfsctl_fini(void)
{
+#ifdef sun
+ /*
+ * Remove vfsctl vnode ops
+ */
+ if (zfsctl_ops_root)
+ vn_freevnodeops(zfsctl_ops_root);
+ if (zfsctl_ops_snapdir)
+ vn_freevnodeops(zfsctl_ops_snapdir);
+ if (zfsctl_ops_snapshot)
+ vn_freevnodeops(zfsctl_ops_snapshot);
+ if (zfsctl_ops_shares)
+ vn_freevnodeops(zfsctl_ops_shares);
+ if (zfsctl_ops_shares_dir)
+ vn_freevnodeops(zfsctl_ops_shares_dir);
+
+ zfsctl_ops_root = NULL;
+ zfsctl_ops_snapdir = NULL;
+ zfsctl_ops_snapshot = NULL;
+ zfsctl_ops_shares = NULL;
+ zfsctl_ops_shares_dir = NULL;
+#endif /* sun */
}
boolean_t
@@ -191,6 +239,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
{
vnode_t *vp, *rvp;
zfsctl_node_t *zcp;
+ uint64_t crtime[2];
ASSERT(zfsvfs->z_ctldir == NULL);
@@ -201,7 +250,9 @@ zfsctl_create(zfsvfs_t *zfsvfs)
zcp->zc_id = ZFSCTL_INO_ROOT;
VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
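+ /* crtime is now an SA attribute rather than a z_phys field. */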
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
VN_URELE(rvp);
/*
@@ -273,12 +324,12 @@ static int
zfsctl_common_access(ap)
struct vop_access_args /* {
struct vnode *a_vp;
- int a_accmode;
+ accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
- int mode = ap->a_accmode;
+ accmode_t accmode = ap->a_accmode;
#ifdef TODO
if (flags & V_ACE_MASK) {
@@ -286,8 +337,8 @@ zfsctl_common_access(ap)
return (EACCES);
} else {
#endif
- if (mode & VWRITE)
- return (EACCES);
+ if (accmode & VWRITE)
+ return (EACCES);
#ifdef TODO
}
#endif
@@ -301,14 +352,13 @@ zfsctl_common_access(ap)
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
- zfsctl_node_t *zcp = vp->v_data;
timestruc_t now;
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_rdev = 0;
/*
- * We are a purly virtual object, so we have no
+ * We are a purely virtual object, so we have no
* blocksize or allocated blocks.
*/
vap->va_blksize = 0;
@@ -323,7 +373,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
*/
gethrestime(&now);
vap->va_atime = now;
- vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
/* FreeBSD: Reset chflags(2) flags. */
vap->va_flags = 0;
}
@@ -363,6 +412,7 @@ zfsctl_common_fid(ap)
return (0);
}
+
/*ARGSUSED*/
static int
zfsctl_shares_fid(ap)
@@ -436,16 +486,18 @@ zfsctl_root_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_node_t *zcp = vp->v_data;
ZFS_ENTER(zfsvfs);
vap->va_nodeid = ZFSCTL_INO_ROOT;
vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+ vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+ vap->va_birthtime = vap->va_ctime;
zfsctl_common_getattr(vp, vap);
ZFS_EXIT(zfsvfs);
@@ -453,6 +505,40 @@ zfsctl_root_getattr(ap)
return (0);
}
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ /*
+ * No extended attributes allowed under .zfs
+ */
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(nm, "..") == 0) {
+ err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
+ if (err == 0)
+ VOP_UNLOCK(*vpp, 0);
+ } else {
+ err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (err);
+}
+
#ifdef sun
static int
zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
@@ -493,40 +579,6 @@ static const fs_operation_def_t zfsctl_tops_root[] = {
*/
/* ARGSUSED */
int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
- if (err == 0)
- VOP_UNLOCK(*vpp, 0);
- } else {
- err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
- cr, ct, direntflags, realpnp);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
zfsctl_freebsd_root_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
@@ -551,7 +603,6 @@ zfsctl_freebsd_root_lookup(ap)
err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL);
if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-
return (err);
}
@@ -566,6 +617,9 @@ static struct vop_vector zfsctl_ops_root = {
.vop_lookup = zfsctl_freebsd_root_lookup,
.vop_inactive = gfs_vop_inactive,
.vop_reclaim = zfsctl_common_reclaim,
+#ifdef TODO
+ .vop_pathconf = zfsctl_pathconf,
+#endif
.vop_fid = zfsctl_common_fid,
};
@@ -596,10 +650,32 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
if ((error = vn_vfswlock(svp)) != 0)
return (error);
+#ifdef sun
+ VN_HOLD(svp);
+ error = dounmount(vn_mountedvfs(svp), fflags, cr);
+ if (error) {
+ VN_RELE(svp);
+ return (error);
+ }
+
+ /*
+ * We can't use VN_RELE(), as that will try to invoke
+ * zfsctl_snapdir_inactive(), which would cause us to destroy
+ * the sd_lock mutex held by our caller.
+ */
+ ASSERT(svp->v_count == 1);
+ gfs_vop_inactive(svp, cr, NULL);
+
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ return (0);
+#else /* !sun */
return (dounmount(vn_mountedvfs(svp), fflags, curthread));
+#endif /* !sun */
}
-#if 0
+#ifdef sun
static void
zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
{
@@ -639,7 +715,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath);
+ vfs_setmntpoint(vfsp, newpath, 0);
pathref = vfs_getresource(vfsp);
(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
@@ -648,13 +724,13 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setresource(vfsp, newpath);
+ vfs_setresource(vfsp, newpath, 0);
vfs_unlock(vfsp);
}
-#endif
+#endif /* sun */
-#if 0
+#ifdef sun
/*ARGSUSED*/
static int
zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
@@ -717,9 +793,9 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
return (err);
}
-#endif
+#endif /* sun */
-#if 0
+#ifdef sun
/* ARGSUSED */
static int
zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
@@ -769,7 +845,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
avl_insert(&sdp->sd_snaps, sep, where);
} else
- err = dmu_objset_destroy(snapname);
+ err = dmu_objset_destroy(snapname, B_FALSE);
} else {
err = ENOENT;
}
@@ -778,7 +854,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
return (err);
}
-#endif
+#endif /* sun */
/*
* This creates a snapshot under '.zfs/snapshot'.
@@ -806,7 +882,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
return (err);
if (err == 0) {
- err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
+ err = dmu_objset_snapshot(name, dirname, NULL, NULL,
+ B_FALSE, B_FALSE, -1);
if (err)
return (err);
err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
@@ -951,8 +1028,7 @@ zfsctl_snapdir_lookup(ap)
*/
return (err == EILSEQ ? ENOENT : err);
}
- if (dmu_objset_open(snapname, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
+ if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
mutex_exit(&sdp->sd_lock);
/* Translate errors and add SAVENAME when needed. */
if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
@@ -972,7 +1048,7 @@ zfsctl_snapdir_lookup(ap)
VN_HOLD(*vpp);
avl_insert(&sdp->sd_snaps, sep, where);
- dmu_objset_close(snap);
+ dmu_objset_rele(snap, FTAG);
domount:
mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
@@ -1194,6 +1270,6 @@ zfsctl_shares_getattr(ap)
}
ZFS_EXIT(zfsvfs);
return (error);
}
/* ARGSUSED */
@@ -1203,11 +1281,10 @@ zfsctl_snapdir_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
zfsctl_snapdir_t *sdp = vp->v_data;
@@ -1215,6 +1292,8 @@ zfsctl_snapdir_getattr(ap)
zfsctl_common_getattr(vp, vap);
vap->va_nodeid = gfs_file_inode(vp);
vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+ vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_birthtime = vap->va_ctime;
ZFS_EXIT(zfsvfs);
return (0);
@@ -1251,6 +1330,38 @@ zfsctl_snapdir_inactive(ap)
return (0);
}
+#ifdef sun
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+ { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+ { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+ { VOPNAME_IOCTL, { .error = fs_inval } },
+ { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+ { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } },
+ { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } },
+ { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } },
+ { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } },
+ { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+ { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } },
+ { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
+ { NULL }
+};
+
+static const fs_operation_def_t zfsctl_tops_shares[] = {
+ { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+ { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+ { VOPNAME_IOCTL, { .error = fs_inval } },
+ { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+ { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } },
+ { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+ { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
+ { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } },
+ { NULL }
+};
+#else /* !sun */
static struct vop_vector zfsctl_ops_snapdir = {
.vop_default = &default_vnodeops,
.vop_open = zfsctl_common_open,
@@ -1279,6 +1390,7 @@ static struct vop_vector zfsctl_ops_shares = {
.vop_reclaim = zfsctl_common_reclaim,
.vop_fid = zfsctl_shares_fid,
};
+#endif /* !sun */
/*
* pvp is the GFS vnode '.zfs/snapshot'.
@@ -1347,8 +1459,8 @@ zfsctl_snapshot_inactive(ap)
if (!locked)
mutex_exit(&sdp->sd_lock);
VN_RELE(dvp);
-end:
+end:
/*
* Dispose of the vnode for the snapshot mount point.
* This is safe to do because once this entry has been removed
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
new file mode 100644
index 000000000000..d0f411a99350
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ zfs_dbgmsg_t *zdm;
+
+ while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+ int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_destroy(&zfs_dbgmsgs_lock);
+ ASSERT3U(zfs_dbgmsg_size, ==, 0);
+}
+
+/*
+ * Print these messages by running:
+ * echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ zfs_dbgmsg_t *zdm;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx);
+ va_end(adx);
+
+ /*
+ * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+ * for the terminating null.
+ */
+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+ zdm->zdm_timestamp = gethrestime_sec();
+
+ va_start(adx, fmt);
+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
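
Aside (not part of the patch): zfs_dbgmsg() above sizes each entry with a first vsnprintf(NULL, 0, ...) pass, allocates size + 1 bytes for the message, formats it for real, and then evicts the oldest entries once zfs_dbgmsg_size exceeds zfs_dbgmsg_maxsize. The two-pass sizing idiom can be shown in a small self-contained user-space C sketch; the names below are illustrative only.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format a message into a freshly allocated buffer, sized in two passes. */
static char *
fmt_alloc(const char *fmt, ...)
{
	va_list adx;
	int size;
	char *buf;

	va_start(adx, fmt);
	size = vsnprintf(NULL, 0, fmt, adx);	/* pass 1: measure only */
	va_end(adx);
	if (size < 0)
		return (NULL);

	buf = malloc(size + 1);			/* one extra byte for the NUL */
	if (buf == NULL)
		return (NULL);

	va_start(adx, fmt);
	(void) vsnprintf(buf, size + 1, fmt, adx);	/* pass 2: fill */
	va_end(adx);
	return (buf);
}

int
main(void)
{
	char *msg = fmt_alloc("txg %llu synced in %d ms", 12345ULL, 87);

	if (msg != NULL) {
		printf("%s\n", msg);
		free(msg);
	}
	return (0);
}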
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index 3ac4741cffc9..bae9071c3c39 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -52,6 +51,8 @@
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/extdirent.h>
@@ -286,8 +287,10 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
* See if there's an object by this name; if so, put a hold on it.
*/
if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
} else {
if (update)
vp = dnlc_lookup(ZTOV(dzp), name);
@@ -379,25 +382,29 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
zfs_dirlock_t *dl;
znode_t *zp;
int error = 0;
+ uint64_t parent;
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
*vpp = ZTOV(dzp);
VN_HOLD(*vpp);
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+
/*
* If we are a snapshot mounted under .zfs, return
* the vp for the snapshot directory.
*/
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
"snapshot", vpp, NULL, 0, NULL, kcred,
NULL, NULL, NULL);
return (error);
}
rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ error = zfs_zget(zfsvfs, parent, &zp);
if (error == 0)
*vpp = ZTOV(zp);
rw_exit(&dzp->z_parent_lock);
@@ -445,7 +452,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
+ ASSERT(zp->z_links == 0);
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -540,10 +547,12 @@ zfs_purgedir(znode_t *dzp)
(ZTOV(xzp)->v_type == VLNK));
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed? */
+ zfs_sa_upgrade_txholds(tx, xzp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -576,15 +585,16 @@ zfs_rmnode(znode_t *zp)
znode_t *xzp = NULL;
dmu_tx_t *tx;
uint64_t acl_obj;
+ uint64_t xattr_obj;
int error;
- ASSERT(zp->z_phys->zp_links == 0);
+ ASSERT(zp->z_links == 0);
/*
* If this is an attribute directory, purge its contents.
*/
if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_XATTR)) {
+ (zp->z_pflags & ZFS_XATTR)) {
if (zfs_purgedir(zp) != 0) {
/*
* Not enough space to delete some xattrs.
@@ -613,12 +623,14 @@ zfs_rmnode(znode_t *zp)
* If the file has extended attributes, we're going to unlink
* the xattr dir.
*/
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT(error == 0);
}
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ acl_obj = zfs_external_acl(zp);
/*
* Set up the final transaction.
@@ -627,11 +639,13 @@ zfs_rmnode(znode_t *zp)
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
/*
@@ -646,10 +660,12 @@ zfs_rmnode(znode_t *zp)
}
if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ ASSERT(error == 0);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
+ xzp->z_links = 0; /* no more links to it */
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx));
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -667,11 +683,12 @@ out:
}
static uint64_t
-zfs_dirent(znode_t *zp)
+zfs_dirent(znode_t *zp, uint64_t mode)
{
uint64_t de = zp->z_id;
+
if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT((zp)->z_phys->zp_mode) << 60;
+ de |= IFTODT(mode) << 60;
return (de);
}
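
Aside (not part of the patch): with the change above, zfs_dirent() takes the file mode explicitly and, on ZPL versions with typed directory entries, packs the DT_* type from IFTODT(mode) into the top four bits of the 64-bit ZAP value, leaving the object number in the low 60 bits. A hypothetical stand-alone sketch of that packing (the macro and function names below are illustrative, not ZFS APIs):

#include <stdint.h>
#include <stdio.h>

#define	DIRENT_TYPE_SHIFT	60
#define	DIRENT_OBJ_MASK		((1ULL << DIRENT_TYPE_SHIFT) - 1)

/* Pack an object number and a 4-bit DT_* type into one 64-bit ZAP value. */
static uint64_t
dirent_pack(uint64_t obj, unsigned int dtype)
{
	return ((obj & DIRENT_OBJ_MASK) |
	    ((uint64_t)dtype << DIRENT_TYPE_SHIFT));
}

static uint64_t
dirent_obj(uint64_t de)
{
	return (de & DIRENT_OBJ_MASK);
}

static unsigned int
dirent_type(uint64_t de)
{
	return ((unsigned int)(de >> DIRENT_TYPE_SHIFT));
}

int
main(void)
{
	uint64_t de = dirent_pack(0x1234, 4);	/* 4 == DT_DIR on most systems */

	printf("obj=%llx type=%u\n",
	    (unsigned long long)dirent_obj(de), dirent_type(de));
	return (0);
}

The payoff is that readdir can report an entry's type directly from the ZAP value without fetching each object's dnode.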
@@ -682,12 +699,15 @@ int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
if (!(flag & ZRENAMING)) {
@@ -696,22 +716,47 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
mutex_exit(&zp->z_lock);
return (ENOENT);
}
- zp->z_phys->zp_links++;
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime, B_TRUE);
}
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
mutex_exit(&zp->z_lock);
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
- value = zfs_dirent(zp);
+ value = zfs_dirent(zp, zp->z_mode);
error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
8, 1, &value, tx);
ASSERT(error == 0);
@@ -721,6 +766,30 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
return (0);
}
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
+ (flag & ZCIEXACT)) ||
+ ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(flag & ZCILOOK)))
+ error = zap_remove_norm(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, MT_EXACT, tx);
+ else
+ error = zap_remove_norm(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, MT_FIRST, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os,
+ dzp->z_id, dl->dl_name, tx);
+ }
+
+ return (error);
+}
+
/*
* Unlink zp from dl, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
@@ -733,16 +802,18 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
boolean_t *unlinkedp)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
dnlc_remove(ZTOV(dzp), dl->dl_name);
if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
if (vn_vfswlock(vp)) /* prevent new mounts on zp */
return (EBUSY);
@@ -752,51 +823,74 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}
mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
return (ENOTEMPTY);
}
- if (zp->z_phys->zp_links <= zp_is_dir) {
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0) {
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
zfs_panic_recover("zfs: link count on vnode %p is %u, "
"should be at least %u", zp->z_vnode,
- (int)zp->z_phys->zp_links,
+ (int)zp->z_links,
zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
+ zp->z_links = zp_is_dir + 1;
}
- if (--zp->z_phys->zp_links == zp_is_dir) {
+ if (--zp->z_links == zp_is_dir) {
zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
+ zp->z_links = 0;
unlinked = B_TRUE;
} else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
}
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
+ } else {
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0)
+ return (error);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
- else
- error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
- } else {
- error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
- }
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -814,7 +908,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
}
int
@@ -826,6 +920,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
int error;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t parent;
*xvpp = NULL;
@@ -846,28 +941,39 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
return (EDQUOT);
}
+top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
- if (error == ERESTART)
+ if (error == ERESTART) {
dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
return (error);
}
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- ASSERT(xzp->z_phys->zp_parent == zp->z_id);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_xattr = xzp->z_id;
+#ifdef DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
xzp, "", NULL, acl_ids.z_fuidp, vap);
@@ -912,7 +1018,6 @@ top:
return (0);
}
- ASSERT(zp->z_phys->zp_xattr == 0);
if (!(flags & CREATE_XATTR_DIR)) {
zfs_dirent_unlock(dl);
@@ -980,11 +1085,11 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
if (zdp->z_zfsvfs->z_replay)
return (0);
- if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+ if ((zdp->z_mode & S_ISVTX) == 0)
return (0);
- downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
- fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
if ((uid = crgetuid(cr)) == downer || uid == fowner ||
(ZTOV(zp)->v_type == VREG &&
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
index 4b27ec324c9c..0b4812666442 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -28,16 +28,12 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
-
-#ifdef _KERNEL
-/* Including sys/bus.h is just too hard, so I declare what I need here. */
-extern void devctl_notify(const char *__system, const char *__subsystem,
- const char *__type, const char *__data);
-#endif
+#include <sys/sysevent.h>
/*
* This general routine is responsible for generating all the different ZFS
@@ -92,21 +88,32 @@ extern void devctl_notify(const char *__system, const char *__subsystem,
* this pointer is set to NULL, and no ereport will be generated (since it
* doesn't actually correspond to any particular device or piece of data,
* and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs. Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO. When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
*/
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+#ifdef _KERNEL
+static void
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+ const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
uint64_t stateoroffset, uint64_t size)
{
-#ifdef _KERNEL
- char buf[1024];
- struct sbuf sb;
- struct timespec ts;
- int error;
+ nvlist_t *ereport, *detector;
+
+ uint64_t ena;
+ char class[64];
/*
- * If we are doing a spa_tryimport(), ignore errors.
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
*/
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
return;
/*
@@ -114,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* failed, don't bother logging any new ereports - we're just going to
* get the same diagnosis anyway.
*/
- if (spa->spa_load_state != SPA_LOAD_NONE &&
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
spa->spa_last_open_failed)
return;
@@ -153,9 +160,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* not yet been asynchronously placed into the REMOVED
* state.
*/
- if (zio->io_vd == vd &&
- !vdev_accessible(vd, zio) &&
- strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
return;
/*
@@ -169,51 +174,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
return;
}
}
- nanotime(&ts);
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return;
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
/*
* Serialize ereport generation
*/
mutex_enter(&spa->spa_errlist_lock);
-#if 0
/*
* Determine the ENA to use for this event. If we are in a loading
* state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
* a root zio-wide ENA. Otherwise, simply use a unique ENA.
*/
- if (spa->spa_load_state != SPA_LOAD_NONE) {
-#if 0
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
if (spa->spa_ena == 0)
spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
-#endif
ena = spa->spa_ena;
} else if (zio != NULL && zio->io_logical != NULL) {
-#if 0
if (zio->io_logical->io_ena == 0)
zio->io_logical->io_ena =
fm_ena_generate(0, FM_ENA_FMT1);
-#endif
ena = zio->io_logical->io_ena;
} else {
-#if 0
ena = fm_ena_generate(0, FM_ENA_FMT1);
-#else
- ena = 0;
-#endif
}
-#endif
/*
* Construct the full class, detector, and other standard FMA fields.
*/
- sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
- sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
- sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
/*
* Construct the per-ereport payload, depending on which parameters are
@@ -223,51 +234,57 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
/*
* Generic payload members common to all ereports.
*/
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa));
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
- spa->spa_load_state);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa_load_state(spa), NULL);
if (spa != NULL) {
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ DATA_TYPE_STRING,
spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
FM_EREPORT_FAILMODE_WAIT :
spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
- FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC);
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+ NULL);
}
if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent;
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- vd->vdev_ops->vdev_op_type);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
if (vd->vdev_path != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
if (vd->vdev_devid != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
if (vd->vdev_fru != NULL)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
if (pvd != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
- pvd->vdev_ops->vdev_op_type);
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
if (pvd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
- pvd->vdev_path);
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
if (pvd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
- pvd->vdev_devid);
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
}
}
@@ -275,8 +292,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
/*
* Payload common to all I/Os.
*/
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
- zio->io_error);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
/*
* If the 'size' parameter is non-zero, it indicates this is a
@@ -284,52 +301,500 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
* provided for us, instead of within the zio_t.
*/
if (vd != NULL) {
- if (size) {
- sbuf_printf(&sb, " %s=%ju",
+ if (size)
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- stateoroffset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
- } else {
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- zio->io_offset);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64, zio->io_offset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- zio->io_size);
- }
+ DATA_TYPE_UINT64, zio->io_size, NULL);
}
/*
* Payload for I/Os with corresponding logical information.
*/
- if (zio->io_logical != NULL) {
- sbuf_printf(&sb, " %s=%ju",
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- zio->io_logical->io_bookmark.zb_object);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- zio->io_logical->io_bookmark.zb_level);
- sbuf_printf(&sb, " %s=%ju",
+ DATA_TYPE_INT64,
+ zio->io_logical->io_bookmark.zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- zio->io_logical->io_bookmark.zb_blkid);
- }
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid, NULL);
} else if (vd != NULL) {
/*
* If we have a vdev but no zio, this is a device fault, and the
* 'stateoroffset' parameter indicates the previous state of the
* vdev.
*/
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
- stateoroffset);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
}
+
mutex_exit(&spa->spa_errlist_lock);
- error = sbuf_finish(&sb);
- devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
- if (error != 0)
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
+ *ereport_out = ereport;
+ *detector_out = detector;
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
+
+#define MAX_RANGES 16
+
+typedef struct zfs_ecksum_info {
+ /* histograms of set and cleared bits by bit number in a 64-bit word */
+ uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+ /* inline arrays of bits set and cleared. */
+ uint64_t zei_bits_set[ZFM_MAX_INLINE];
+ uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+ /*
+ * for each range, the number of bits set and cleared. The Hamming
+ * distance between the good and bad buffers is the sum of them all.
+ */
+ uint32_t zei_range_sets[MAX_RANGES];
+ uint32_t zei_range_clears[MAX_RANGES];
+
+ struct zei_ranges {
+ uint32_t zr_start;
+ uint32_t zr_end;
+ } zei_ranges[MAX_RANGES];
+
+ size_t zei_range_count;
+ uint32_t zei_mingap;
+ uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
+{
+ size_t i;
+ size_t bits = 0;
+ uint64_t value = BE_64(value_arg);
+
+ /* We store the bits in big-endian (largest-first) order */
+ for (i = 0; i < 64; i++) {
+ if (value & (1ull << i)) {
+ hist[63 - i]++;
+ ++bits;
+ }
+ }
+ /* update the count of bits changed */
+ *count += bits;
+}
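
Aside (not part of the patch): update_histogram() above records, for each bit position of a 64-bit word, how often that bit was flipped, and adds the word's count of flipped bits to the running total; the kernel version also byte-swaps to a canonical order and keeps set and cleared bits in separate histograms. A reduced user-space sketch of the same tally (one combined histogram, no byte-order handling):

#include <stdint.h>
#include <stdio.h>

/*
 * Tally which bit positions differ between two words: hist[p] counts how
 * many times bit p differed, *total accumulates the Hamming distance.
 */
static void
tally_diff(uint64_t good, uint64_t bad, unsigned int hist[64],
    unsigned int *total)
{
	uint64_t diff = good ^ bad;

	for (int i = 0; i < 64; i++) {
		if (diff & (1ULL << i)) {
			hist[i]++;
			(*total)++;
		}
	}
}

int
main(void)
{
	unsigned int hist[64] = { 0 };
	unsigned int total = 0;

	tally_diff(0xff00ff00ff00ff00ULL, 0xff00ff00ff00ff01ULL, hist, &total);
	tally_diff(0x0123456789abcdefULL, 0x0123456789abcdeeULL, hist, &total);

	printf("bit 0 flipped %u times, %u bits total\n", hist[0], total);
	return (0);
}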
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly. zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that. We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, at least one pair of adjacent ranges is that close, so
+ * the list is guaranteed to shrink. We also update zei_mingap to the new
+ * smallest gap, to prepare for our next invocation.
+ */
+static void
+shrink_ranges(zfs_ecksum_info_t *eip)
+{
+ uint32_t mingap = UINT32_MAX;
+ uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+ size_t idx, output;
+ size_t max = eip->zei_range_count;
+
+ struct zei_ranges *r = eip->zei_ranges;
+
+ ASSERT3U(eip->zei_range_count, >, 0);
+ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+ output = idx = 0;
+ while (idx < max - 1) {
+ uint32_t start = r[idx].zr_start;
+ uint32_t end = r[idx].zr_end;
+
+ while (idx < max - 1) {
+ idx++;
+
+ uint32_t nstart = r[idx].zr_start;
+ uint32_t nend = r[idx].zr_end;
+
+ uint32_t gap = nstart - end;
+ if (gap < new_allowed_gap) {
+ end = nend;
+ continue;
+ }
+ if (gap < mingap)
+ mingap = gap;
+ break;
+ }
+ r[output].zr_start = start;
+ r[output].zr_end = end;
+ output++;
+ }
+ ASSERT3U(output, <, eip->zei_range_count);
+ eip->zei_range_count = output;
+ eip->zei_mingap = mingap;
+ eip->zei_allowed_mingap = new_allowed_gap;
+}
+
+static void
+add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+
+ if (count >= MAX_RANGES) {
+ shrink_ranges(eip);
+ count = eip->zei_range_count;
+ }
+ if (count == 0) {
+ eip->zei_mingap = UINT32_MAX;
+ eip->zei_allowed_mingap = 1;
+ } else {
+ int gap = start - r[count - 1].zr_end;
+
+ if (gap < eip->zei_allowed_mingap) {
+ r[count - 1].zr_end = end;
+ return;
+ }
+ if (gap < eip->zei_mingap)
+ eip->zei_mingap = gap;
+ }
+ r[count].zr_start = start;
+ r[count].zr_end = end;
+ eip->zei_range_count++;
+}
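
Aside (not part of the patch): add_range() and shrink_ranges() above keep at most MAX_RANGES half-open intervals of differing 64-bit words, extending the previous interval when the gap to it is below the current allowed minimum, and re-coalescing the whole list with a larger allowed gap when the array fills up. A much-simplified user-space sketch of just the append-or-extend step (fixed gap, no adaptive shrinking; interval starts must arrive in increasing order, as in the real code):

#include <stddef.h>
#include <stdio.h>

#define	NRANGES	16

struct range {
	unsigned int start;	/* inclusive */
	unsigned int end;	/* exclusive */
};

static struct range ranges[NRANGES];
static size_t nranges;

/* Append [start, end); merge into the previous range if the gap is small. */
static void
add_range(unsigned int start, unsigned int end, unsigned int allowed_gap)
{
	if (nranges > 0 && start - ranges[nranges - 1].end < allowed_gap) {
		ranges[nranges - 1].end = end;	/* close enough: extend */
		return;
	}
	if (nranges < NRANGES) {
		ranges[nranges].start = start;
		ranges[nranges].end = end;
		nranges++;
	}
}

int
main(void)
{
	/* Differing word indexes 3-4, 6, and 40, with an allowed gap of 2. */
	add_range(3, 5, 2);
	add_range(6, 7, 2);	/* gap of 1 -> merged into [3, 7) */
	add_range(40, 41, 2);	/* gap of 33 -> new range */

	for (size_t i = 0; i < nranges; i++)
		printf("[%u, %u)\n", ranges[i].start, ranges[i].end);
	return (0);
}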
+
+static size_t
+range_total_size(zfs_ecksum_info_t *eip)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+ size_t result = 0;
+ size_t idx;
+
+ for (idx = 0; idx < count; idx++)
+ result += (r[idx].zr_end - r[idx].zr_start);
+
+ return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+ const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
+ boolean_t drop_if_identical)
+{
+ const uint64_t *good = (const uint64_t *)goodbuf;
+ const uint64_t *bad = (const uint64_t *)badbuf;
+
+ uint64_t allset = 0;
+ uint64_t allcleared = 0;
+
+ size_t nui64s = size / sizeof (uint64_t);
+
+ size_t inline_size;
+ int no_inline = 0;
+ size_t idx;
+ size_t range;
+
+ size_t offset = 0;
+ ssize_t start = -1;
+
+ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+ /* don't do any annotation for injected checksum errors */
+ if (info != NULL && info->zbc_injected)
+ return (eip);
+
+ if (info != NULL && info->zbc_has_cksum) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_expected) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_expected,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_actual) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_actual,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+ DATA_TYPE_STRING,
+ info->zbc_checksum_name,
+ NULL);
+
+ if (info->zbc_byteswapped) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+ DATA_TYPE_BOOLEAN, 1,
+ NULL);
+ }
+ }
+
+ if (badbuf == NULL || goodbuf == NULL)
+ return (eip);
+
+ ASSERT3U(nui64s, <=, UINT16_MAX);
+ ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, UINT32_MAX);
+
+ /* build up the range list by comparing the two buffers. */
+ for (idx = 0; idx < nui64s; idx++) {
+ if (good[idx] == bad[idx]) {
+ if (start == -1)
+ continue;
+
+ add_range(eip, start, idx);
+ start = -1;
+ } else {
+ if (start != -1)
+ continue;
+
+ start = idx;
+ }
+ }
+ if (start != -1)
+ add_range(eip, start, idx);
+
+ /* See if it will fit in our inline buffers */
+ inline_size = range_total_size(eip);
+ if (inline_size > ZFM_MAX_INLINE)
+ no_inline = 1;
+
+ /*
+ * If there is no change and we want to drop if the buffers are
+ * identical, do so.
+ */
+ if (inline_size == 0 && drop_if_identical) {
+ kmem_free(eip, sizeof (*eip));
+ return (NULL);
+ }
+
+ /*
+ * Now walk through the ranges, filling in the details of the
+ * differences. Also convert our uint64_t-array offsets to byte
+ * offsets.
+ */
+ for (range = 0; range < eip->zei_range_count; range++) {
+ size_t start = eip->zei_ranges[range].zr_start;
+ size_t end = eip->zei_ranges[range].zr_end;
+
+ for (idx = start; idx < end; idx++) {
+ uint64_t set, cleared;
+
+ // bits set in bad, but not in good
+ set = ((~good[idx]) & bad[idx]);
+ // bits set in good, but not in bad
+ cleared = (good[idx] & (~bad[idx]));
+
+ allset |= set;
+ allcleared |= cleared;
+
+ if (!no_inline) {
+ ASSERT3U(offset, <, inline_size);
+ eip->zei_bits_set[offset] = set;
+ eip->zei_bits_cleared[offset] = cleared;
+ offset++;
+ }
+
+ update_histogram(set, eip->zei_histogram_set,
+ &eip->zei_range_sets[range]);
+ update_histogram(cleared, eip->zei_histogram_cleared,
+ &eip->zei_range_clears[range]);
+ }
+
+ /* convert to byte offsets */
+ eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+ eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
+ }
+ eip->zei_allowed_mingap *= sizeof (uint64_t);
+ inline_size *= sizeof (uint64_t);
+
+ /* fill in ereport */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+ (uint32_t *)eip->zei_ranges,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+ DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+ NULL);
+
+ if (!no_inline) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_cleared,
+ NULL);
+ } else {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+ DATA_TYPE_UINT16_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+ DATA_TYPE_UINT16_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+ NULL);
+ }
+ return (eip);
+}
+#endif
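
Aside (not part of the patch): for every differing word, annotate_ecksum() above splits the corruption into bits spuriously set ((~good) & bad) and bits spuriously cleared (good & ~bad); the two masks are disjoint, their union is good ^ bad, and the sum of their popcounts over all ranges is the Hamming distance mentioned in the structure comment. A tiny stand-alone check of that decomposition:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t good = 0xf0f0f0f0f0f0f0f0ULL;
	uint64_t bad  = 0xf0f0f0f0f0f0ff00ULL;

	uint64_t set     = ~good & bad;		/* bits set in bad, not in good */
	uint64_t cleared = good & ~bad;		/* bits set in good, not in bad */

	/* The two masks are disjoint and together cover every differing bit. */
	printf("set     = %016jx\n", (uintmax_t)set);
	printf("cleared = %016jx\n", (uintmax_t)cleared);
	printf("xor     = %016jx\n", (uintmax_t)(good ^ bad));
	printf("check   = %d\n", (set | cleared) == (good ^ bad) &&
	    (set & cleared) == 0);
	return (0);
}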
+
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+
+ zfs_ereport_start(&ereport, &detector,
+ subclass, spa, vd, zio, stateoroffset, size);
+
+ if (ereport == NULL)
+ return;
+
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif
+}
+
+void
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+ zio_bad_cksum_t *info)
+{
+ zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+ if (zio->io_vsd != NULL)
+ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+ else
+ zio_vsd_default_cksum_report(zio, report, arg);
+
+ /* copy the checksum failure information if it was provided */
+ if (info != NULL) {
+ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+ bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ }
+
+ report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_length = length;
+
+#ifdef _KERNEL
+ zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (report->zcr_ereport == NULL) {
+ report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
+ kmem_free(report, sizeof (*report));
+ return;
+ }
+#endif
+
+ mutex_enter(&spa->spa_errlist_lock);
+ report->zcr_next = zio->io_logical->io_cksum_report;
+ zio->io_logical->io_cksum_report = report;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+ zfs_ecksum_info_t *info = NULL;
+ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+ good_data, bad_data, report->zcr_length, drop_if_identical);
+
+ if (info != NULL)
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
+ report->zcr_ereport = report->zcr_detector = NULL;
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+ if (rpt->zcr_ereport != NULL) {
+ fm_nvlist_destroy(rpt->zcr_ereport,
+ FM_NVA_FREE);
+ fm_nvlist_destroy(rpt->zcr_detector,
+ FM_NVA_FREE);
+ }
+#endif
+ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+ if (rpt->zcr_ckinfo != NULL)
+ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+ kmem_free(rpt, sizeof (*rpt));
+}
+
+void
+zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
+{
+#ifdef _KERNEL
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+#endif
+}
+
+void
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+ zfs_ecksum_info_t *info;
+
+ zfs_ereport_start(&ereport, &detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (ereport == NULL)
+ return;
+
+ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+ B_FALSE);
+
+ if (info != NULL)
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
#endif
}
@@ -337,32 +802,28 @@ static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
- char buf[1024];
+ nvlist_t *resource;
char class[64];
- struct sbuf sb;
- struct timespec ts;
- int error;
- nanotime(&ts);
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return;
- snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
ZFS_ERROR_CLASS, name);
- sbuf_printf(&sb, " %s=%d", FM_VERSION, FM_RSRC_VERSION);
- sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
+ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
if (vd)
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- error = sbuf_finish(&sb);
- ZFS_LOG(1, "%s", sbuf_data(&sb));
- devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
- if (error != 0)
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+
+ fm_ereport_post(resource, EVCH_SLEEP);
+
+ fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}
@@ -388,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
index 8090ec10f18a..5b54448aeedb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
-#include <sys/sunddi.h>
#include <sys/dmu.h>
#include <sys/avl.h>
#include <sys/zap.h>
@@ -377,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
- if (zfsvfs->z_fuid_obj)
+ if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
else
domain = nulldomain;
@@ -390,10 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
void
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
{
- *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
- cr, ZFS_OWNER);
- *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
- cr, ZFS_GROUP);
+ *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
}
uid_t
@@ -418,9 +414,9 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
(void) kidmap_getgidbysid(crgetzone(cr), domain,
FUID_RID(fuid), &id);
}
-#else /* sun */
+#else /* !sun */
id = UID_NOBODY;
-#endif /* sun */
+#endif /* !sun */
return (id);
}
@@ -431,7 +427,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
* If ACL has multiple domains, then keep only one copy of each unique
* domain.
*/
-static void
+void
zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
uint64_t idx, uint64_t id, zfs_fuid_type_t type)
{
@@ -492,6 +488,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
/*
* Create a file system FUID, based on information in the users cred
+ *
+ * If cred contains KSID_OWNER then it should be used to determine
+ * the uid otherwise cred's uid will be used. By default cred's gid
+ * is used unless it's an ephemeral ID in which case KSID_GROUP will
+ * be used if it exists.
*/
uint64_t
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
@@ -506,24 +507,31 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
- if (type == ZFS_OWNER)
- id = crgetuid(cr);
- else
- id = crgetgid(cr);
+ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+
+ if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
- if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id))
return ((uint64_t)id);
+ }
-#ifdef sun
- ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+ /*
+ * ksid is present and FUID is supported
+ */
+ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+ if (!IS_EPHEMERAL(id))
+ return ((uint64_t)id);
+
+ if (type == ZFS_GROUP)
+ id = ksid_getid(ksid);
- VERIFY(ksid != NULL);
rid = ksid_getrid(ksid);
domain = ksid_getdomain(ksid);
-#else /* sun */
- rid = UID_NOBODY;
- domain = nulldomain;
-#endif /* sun */
+
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
@@ -597,7 +605,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
};
domain = fuidp->z_domain_table[idx -1];
} else {
-#ifdef sun
if (type == ZFS_OWNER || type == ZFS_ACE_USER)
status = kidmap_getsidbyuid(crgetzone(cr), id,
&domain, &rid);
@@ -606,7 +613,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
&domain, &rid);
if (status != 0) {
-#endif /* sun */
/*
* When returning nobody we will need to
* make a dummy fuid table entry for logging
@@ -614,9 +620,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
*/
rid = UID_NOBODY;
domain = nulldomain;
-#ifdef sun
}
-#endif /* sun */
}
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
@@ -699,18 +703,16 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
#ifdef sun
ksid_t *ksid = crgetsid(cr, KSID_GROUP);
ksidlist_t *ksidlist = crgetsidlist(cr);
-#endif /* sun */
+#endif /* !sun */
uid_t gid;
#ifdef sun
if (ksid && ksidlist) {
int i;
ksid_t *ksid_groups;
- ksidlist_t *ksidlist = crgetsidlist(cr);
uint32_t idx = FUID_INDEX(id);
uint32_t rid = FUID_RID(id);
- ASSERT(ksidlist);
ksid_groups = ksidlist->ksl_sids;
for (i = 0; i != ksidlist->ksl_nsid; i++) {
@@ -736,7 +738,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
}
}
}
-#endif /* sun */
+#endif /* !sun */
/*
* Not found in ksidlist, check posix groups
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 9a68adffbd6b..52300ee442e1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -47,7 +46,6 @@
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
#include <sys/dmu.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
@@ -65,14 +63,18 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
#include <sys/dmu_objset.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+#include "zfs_ioctl_compat.h"
-CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
+CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
static struct cdev *zfsdev;
@@ -105,17 +107,22 @@ static const char *userquota_perms[] = {
};
static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
-static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+ cred_t *cr);
+static int zfs_check_clearable(char *dataset, nvlist_t *props,
+ nvlist_t **errors);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *);
-int zfs_set_prop_nvlist(const char *, nvlist_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **);
+
+static void zfsdev_close(void *data);
/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
void
__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
{
const char *newfile;
- char buf[256];
+ char buf[512];
va_list adx;
/*
@@ -178,22 +185,15 @@ history_str_get(zfs_cmd_t *zc)
static boolean_t
zfs_is_bootfs(const char *name)
{
- spa_t *spa;
- boolean_t ret = B_FALSE;
-
- if (spa_open(name, &spa, FTAG) == 0) {
- if (spa->spa_bootfs) {
- objset_t *os;
+ objset_t *os;
- if (dmu_objset_open(name, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- ret = (dmu_objset_id(os) == spa->spa_bootfs);
- dmu_objset_close(os);
- }
- }
- spa_close(spa, FTAG);
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ boolean_t ret;
+ ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
+ dmu_objset_rele(os, FTAG);
+ return (ret);
}
- return (ret);
+ return (B_FALSE);
}
/*
@@ -227,13 +227,17 @@ zpl_earlier_version(const char *name, int version)
objset_t *os;
boolean_t rc = B_TRUE;
- if (dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
uint64_t zplversion;
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (B_TRUE);
+ }
+ /* XXX reading from non-owned objset */
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
rc = zplversion < version;
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
return (rc);
}
@@ -282,9 +286,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
}
static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
{
- uint64_t zoned;
int writable = 1;
/*
@@ -295,9 +298,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
!zone_dataset_visible(dataset, &writable))
return (ENOENT);
- if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
- return (ENOENT);
-
if (INGLOBALZONE(curthread)) {
/*
* If the fs is zoned, only root can access it from the
@@ -319,6 +319,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
return (0);
}
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+ return (ENOENT);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_prop_get_ds(ds, "jailed", 8, 1, &zoned, NULL)) {
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ return (ENOENT);
+ }
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
int
zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
{
@@ -333,9 +359,126 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
return (error);
}
+int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
+#ifdef SECLABEL
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
static int
-zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
+zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
{
+ char ds_hexsl[MAXNAMELEN];
+ bslabel_t ds_sl, new_sl;
+ boolean_t new_default = FALSE;
+ uint64_t zoned;
+ int needed_priv = -1;
+ int error;
+
+ /* First get the existing dataset label. */
+ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (EPERM);
+
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ new_default = TRUE;
+
+ /* The label must be translatable */
+ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+ return (EINVAL);
+
+ /*
+ * In a non-global zone, disallow attempts to set a label that
+ * doesn't match that of the zone; otherwise no other checks
+ * are needed.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+ return (EPERM);
+ return (0);
+ }
+
+ /*
+ * For global-zone datasets (i.e., those whose zoned property is
+ * "off"), verify that the specified new label is valid for the
+ * global zone.
+ */
+ if (dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (EPERM);
+ if (!zoned) {
+ if (zfs_check_global_label(name, strval) != 0)
+ return (EPERM);
+ }
+
+ /*
+ * If the existing dataset label is nondefault, check if the
+ * dataset is mounted (label cannot be changed while mounted).
+ * Get the zfsvfs; if there isn't one, then the dataset isn't
+ * mounted (or isn't a dataset, doesn't exist, ...).
+ */
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+ objset_t *os;
+ static char *setsl_tag = "setsl_tag";
+
+ /*
+ * Try to own the dataset; abort if there is any error
+ * (e.g., already mounted, in use, or other error).
+ */
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
+ setsl_tag, &os);
+ if (error)
+ return (EPERM);
+
+ dmu_objset_disown(os, setsl_tag);
+
+ if (new_default) {
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ goto out_check;
+ }
+
+ if (hexstr_to_label(strval, &new_sl) != 0)
+ return (EPERM);
+
+ if (blstrictdom(&ds_sl, &new_sl))
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ else if (blstrictdom(&new_sl, &ds_sl))
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ } else {
+ /* dataset currently has a default label */
+ if (!new_default)
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ }
+
+out_check:
+ if (needed_priv != -1)
+ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+ return (0);
+}
+#endif /* SECLABEL */
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+ cred_t *cr)
+{
+ char *strval;
+
/*
* Check permissions for special properties.
*/
@@ -357,16 +500,33 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
* quota on things *under* (ie. contained by)
* the thing they own.
*/
- if (dsl_prop_get_integer(name, "jailed", &zoned,
+ if (dsl_prop_get_integer(dsname, "jailed", &zoned,
setpoint))
return (EPERM);
- if (!zoned || strlen(name) <= strlen(setpoint))
+ if (!zoned || strlen(dsname) <= strlen(setpoint))
return (EPERM);
}
break;
+
+ case ZFS_PROP_MLSLABEL:
+#ifdef SECLABEL
+ if (!is_system_labeled())
+ return (EPERM);
+
+ if (nvpair_value_string(propval, &strval) == 0) {
+ int err;
+
+ err = zfs_set_slabel_policy(dsname, strval, CRED());
+ if (err != 0)
+ return (err);
+ }
+#else
+ return (EOPNOTSUPP);
+#endif
+ break;
}
- return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr));
+ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
}
int
@@ -388,20 +548,45 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr)
int
zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
{
- int error;
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_ROLLBACK, cr);
- if (error == 0)
- error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr);
- return (error);
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_ROLLBACK, cr));
}
int
zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_SEND, cr));
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (EINVAL);
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
}
static int
@@ -495,19 +680,34 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
}
/*
- * Must have sys_config privilege to check the iscsi permission
+ * Destroying snapshots with delegated permissions requires
+ * descendent mount and destroy permissions.
+ * Reassemble the full filesystem@snap name so dsl_deleg_access()
+ * can do the correct permission check.
+ *
+ * Since this routine is used when doing a recursive destroy of snapshots
+ * and destroying snapshots requires descendent permissions, a successful
+ * check of the top level snapshot applies to snapshots of all descendent
+ * datasets as well.
*/
-/* ARGSUSED */
static int
-zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
{
- return (secpolicy_zfs(cr));
+ int error;
+ char *dsname;
+
+ dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+
+ error = zfs_secpolicy_destroy_perms(dsname, cr);
+
+ strfree(dsname);
+ return (error);
}
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[MAXNAMELEN];
int error;
if ((error = zfs_secpolicy_write_perms(from,
@@ -542,7 +742,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[MAXNAMELEN];
objset_t *clone;
int error;
@@ -551,20 +751,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
if (error)
return (error);
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(zc->zc_name, FTAG, &clone);
if (error == 0) {
dsl_dataset_t *pclone = NULL;
dsl_dir_t *dd;
- dd = clone->os->os_dsl_dataset->ds_dir;
+ dd = clone->os_dsl_dataset->ds_dir;
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
error = dsl_dataset_hold_obj(dd->dd_pool,
dd->dd_phys->dd_origin_obj, FTAG, &pclone);
rw_exit(&dd->dd_pool->dp_config_rwlock);
if (error) {
- dmu_objset_close(clone);
+ dmu_objset_rele(clone, FTAG);
return (error);
}
@@ -572,7 +771,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
ZFS_DELEG_PERM_MOUNT, cr);
dsl_dataset_name(pclone, parentname);
- dmu_objset_close(clone);
+ dmu_objset_rele(clone, FTAG);
dsl_dataset_rele(pclone, FTAG);
if (error == 0)
error = zfs_secpolicy_write_perms(parentname,
@@ -601,16 +800,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr)
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
- int error;
-
- if ((error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0)
- return (error);
-
- error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_MOUNT, cr);
-
- return (error);
+ return (zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_SNAPSHOT, cr));
}
static int
@@ -623,8 +814,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr)
{
- char parentname[MAXNAMELEN];
- int error;
+ char parentname[MAXNAMELEN];
+ int error;
if ((error = zfs_get_parent(zc->zc_name, parentname,
sizeof (parentname))) != 0)
@@ -673,19 +864,19 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
}
/*
- * Just like zfs_secpolicy_config, except that we will check for
- * mount permission on the dataset for permission to create/remove
- * the minor nodes.
+ * Policy for object to name lookups.
*/
+/* ARGSUSED */
static int
-zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr)
{
- if (secpolicy_sys_config(cr, B_FALSE) != 0) {
- return (dsl_deleg_access(zc->zc_name,
- ZFS_DELEG_PERM_MOUNT, cr));
- }
+ int error;
- return (0);
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
}
/*
@@ -709,28 +900,11 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_USERPROP, cr));
} else {
- if (!zfs_prop_inheritable(prop))
- return (EINVAL);
- return (zfs_secpolicy_setprop(zc->zc_name, prop, cr));
+ return (zfs_secpolicy_setprop(zc->zc_name, prop,
+ NULL, cr));
}
}
-/*
- * Policy for dataset backup operations (sendbackup).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-static int
-zfs_secpolicy_operator(const char *dataset, cred_t *cr)
-{
- int writable = 1;
-
- if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
- if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
- return (EPERM);
- return (0);
-}
-
static int
zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr)
{
@@ -777,14 +951,56 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
static int
zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
{
- return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+ NULL, cr));
+}
+
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_HOLD, cr));
+}
+
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RELEASE, cr));
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+ * Delegated diff permission alone is sufficient for us to allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_hold(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_release(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_destroy(zc, cr);
+ return (error);
}
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
-get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
{
char *packed;
int error;
@@ -798,7 +1014,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
packed = kmem_alloc(size, KM_SLEEP);
- if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) {
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
kmem_free(packed, size);
return (error);
}
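With get_nvlist() now copying in via ddi_copyin() under the caller's iflag, the user/kernel contract stays the same: the caller hands over a natively packed nvlist through zc_nvlist_src and its length through zc_nvlist_src_size. A minimal sketch of the user-space side of that packing, assuming libnvpair is available; the helper name here is hypothetical (libzfs has its own equivalent for this):

/*
 * Hypothetical userland helper: pack a property nvlist into a buffer
 * suitable for handing to the ioctls below via zc_nvlist_src and
 * zc_nvlist_src_size.  nvlist_pack() allocates the buffer when
 * *packed is NULL; the caller must eventually free it.
 */
#include <libnvpair.h>
#include <stdint.h>

static int
pack_props(nvlist_t *props, uint64_t *zc_nvlist_srcp,
    uint64_t *zc_nvlist_src_sizep)
{
	char *packed = NULL;
	size_t size = 0;
	int err;

	/* Native encoding matches what get_nvlist() unpacks in the kernel. */
	err = nvlist_pack(props, &packed, &size, NV_ENCODE_NATIVE, 0);
	if (err != 0)
		return (err);

	*zc_nvlist_srcp = (uint64_t)(uintptr_t)packed;
	*zc_nvlist_src_sizep = size;
	return (0);
}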
@@ -815,11 +1032,46 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
}
static int
+fit_error_list(zfs_cmd_t *zc, nvlist_t **errors)
+{
+ size_t size;
+
+ VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ nvpair_t *more_errors;
+ int n = 0;
+
+ if (zc->zc_nvlist_dst_size < 1024)
+ return (ENOMEM);
+
+ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0);
+ more_errors = nvlist_prev_nvpair(*errors, NULL);
+
+ do {
+ nvpair_t *pair = nvlist_prev_nvpair(*errors,
+ more_errors);
+ VERIFY(nvlist_remove_nvpair(*errors, pair) == 0);
+ n++;
+ VERIFY(nvlist_size(*errors, &size,
+ NV_ENCODE_NATIVE) == 0);
+ } while (size > zc->zc_nvlist_dst_size);
+
+ VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0);
+ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0);
+ ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+ ASSERT(size <= zc->zc_nvlist_dst_size);
+ }
+
+ return (0);
+}
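A back-of-the-envelope model of the trimming strategy fit_error_list() implements: drop entries from the tail (the most recently added errors) until the remainder plus an "N more errors" marker fits the caller's buffer, then record how many were dropped under ZPROP_N_MORE_ERRORS. The kernel version additionally refuses outright when the destination buffer is under 1 KB; the sizes below are illustrative stand-ins for the packed nvlist sizes it recomputes with nvlist_size():

/*
 * Simplified model of fit_error_list(): trim from the tail of a list
 * of error entries until everything, plus a fixed-size marker, fits in
 * the given budget.  Returns the number of entries dropped.
 */
#include <stddef.h>

#define	MARKER_SIZE	32	/* stand-in for the ZPROP_N_MORE_ERRORS pair */

static int
trim_to_fit(size_t *entry_sizes, int *nentries, size_t budget)
{
	size_t total = MARKER_SIZE;
	int kept, dropped;

	for (kept = 0; kept < *nentries; kept++)
		total += entry_sizes[kept];

	dropped = 0;
	while (kept > 0 && total > budget) {
		kept--;				/* drop the newest entry */
		total -= entry_sizes[kept];
		dropped++;
	}

	*nentries = kept;
	return (dropped);	/* what would be recorded as ZPROP_N_MORE_ERRORS */
}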
+
+static int
put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
{
char *packed = NULL;
+ int error = 0;
size_t size;
- int error;
VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
@@ -837,8 +1089,9 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
packed = kmem_alloc(size, KM_SLEEP);
VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
KM_SLEEP) == 0);
- error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size);
+ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size, zc->zc_iflags) != 0)
+ error = EFAULT;
kmem_free(packed, size);
}
@@ -847,25 +1100,28 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
}
static int
-getzfsvfs(const char *dsname, zfsvfs_t **zvp)
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
{
objset_t *os;
int error;
- error = dmu_objset_open(dsname, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &os);
+ error = dmu_objset_hold(dsname, FTAG, &os);
if (error)
return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
- mutex_enter(&os->os->os_user_ptr_lock);
- *zvp = dmu_objset_get_user(os);
- if (*zvp) {
- VFS_HOLD((*zvp)->z_vfs);
+ mutex_enter(&os->os_user_ptr_lock);
+ *zfvp = dmu_objset_get_user(os);
+ if (*zfvp) {
+ VFS_HOLD((*zfvp)->z_vfs);
} else {
error = ESRCH;
}
- mutex_exit(&os->os->os_user_ptr_lock);
- dmu_objset_close(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -874,22 +1130,22 @@ getzfsvfs(const char *dsname, zfsvfs_t **zvp)
* case its z_vfs will be NULL, and it will be opened as the owner.
*/
static int
-zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
{
int error = 0;
- int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0);
- if (getzfsvfs(name, zvp) != 0)
- error = zfsvfs_create(name, mode, zvp);
+ if (getzfsvfs(name, zfvp) != 0)
+ error = zfsvfs_create(name, zfvp);
if (error == 0) {
- rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag);
- if ((*zvp)->z_unmounted) {
+ rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+ RW_READER, tag);
+ if ((*zfvp)->z_unmounted) {
/*
* XXX we could probably try again, since the unmounting
* thread should be just about to disassociate the
* objset from the zfsvfs.
*/
- rrw_exit(&(*zvp)->z_teardown_lock, tag);
+ rrw_exit(&(*zfvp)->z_teardown_lock, tag);
return (EBUSY);
}
}
@@ -904,7 +1160,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
if (zfsvfs->z_vfs) {
VFS_RELE(zfsvfs->z_vfs);
} else {
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
}
}
@@ -919,11 +1175,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
char *buf;
if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config))
+ zc->zc_iflags, &config))
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -962,8 +1219,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
/*
* Set the remaining root properties
*/
- if (!error &&
- (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0)
+ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
+ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
(void) spa_destroy(zc->zc_name);
if (buf != NULL)
@@ -984,22 +1241,25 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
int error;
zfs_log_history(zc);
error = spa_destroy(zc->zc_name);
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
return (error);
}
static int
zfs_ioc_pool_import(zfs_cmd_t *zc)
{
- int error;
nvlist_t *config, *props = NULL;
uint64_t guid;
+ int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) != 0)
+ zc->zc_iflags, &config)) != 0)
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -1007,11 +1267,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != zc->zc_guid)
error = EINVAL;
- else if (zc->zc_cookie)
- error = spa_import_verbatim(zc->zc_name, config,
- props);
else
- error = spa_import(zc->zc_name, config, props);
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
nvlist_free(config);
@@ -1030,6 +1294,8 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force, hardforce);
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
return (error);
}
@@ -1087,7 +1353,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &tryconfig)) != 0)
+ zc->zc_iflags, &tryconfig)) != 0)
return (error);
config = spa_tryimport(tryconfig);
@@ -1103,8 +1369,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ */
static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
@@ -1112,7 +1383,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
- error = spa_scrub(spa, zc->zc_cookie);
+ if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
spa_close(spa, FTAG);
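The handler above takes the requested scan function in zc_cookie (POOL_SCAN_NONE stops a scan in progress). A rough sketch of how user space could drive it, assuming a Solaris-style /dev/zfs node, that the zfs_cmd_t and ZFS_IOC_POOL_SCAN definitions from the kernel headers are in scope, and that the raw enum value is usable as the ioctl request; in the real tree this goes through libzfs rather than a hand-rolled ioctl:

/*
 * Hypothetical userland sketch: kick off a scrub on a pool.  Header
 * paths and the raw ioctl request are assumptions, not a documented
 * interface.
 */
#include <sys/fs/zfs.h>		/* pool_scan_func_t: POOL_SCAN_SCRUB, ... */
#include <sys/zfs_ioctl.h>	/* zfs_cmd_t, ZFS_IOC_POOL_SCAN */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int
start_scrub(const char *pool)
{
	zfs_cmd_t zc;
	int fd, err;

	if ((fd = open("/dev/zfs", O_RDWR)) < 0)
		return (-1);

	(void) memset(&zc, 0, sizeof (zc));
	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
	zc.zc_cookie = POOL_SCAN_SCRUB;	/* POOL_SCAN_NONE would stop one */

	err = ioctl(fd, ZFS_IOC_POOL_SCAN, &zc);
	(void) close(fd);
	return (err);
}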
@@ -1175,9 +1449,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
hist_buf = kmem_alloc(size, KM_SLEEP);
if ((error = spa_history_get(spa, &zc->zc_history_offset,
&zc->zc_history_len, hist_buf)) == 0) {
- error = xcopyout(hist_buf,
- (char *)(uintptr_t)zc->zc_history,
- zc->zc_history_len);
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
}
spa_close(spa, FTAG);
@@ -1196,18 +1470,59 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
return (0);
}
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_value name of object
+ */
static int
zfs_ioc_obj_to_path(zfs_cmd_t *zc)
{
- objset_t *osp;
+ objset_t *os;
int error;
- if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
- DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0)
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
return (error);
- error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
+ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
sizeof (zc->zc_value));
- dmu_objset_close(osp);
+ dmu_objset_rele(os, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -1217,20 +1532,15 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
-#ifdef sun
nvlist_t *config, **l2cache, **spares;
uint_t nl2cache = 0, nspares = 0;
-#else
- nvlist_t *config;
-#endif
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config);
-#ifdef sun
+ zc->zc_iflags, &config);
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache);
@@ -1247,11 +1557,11 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
*
* l2cache and spare devices are ok to be added to a rootpool.
*/
- if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
+ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
+ nvlist_free(config);
spa_close(spa, FTAG);
return (EDOM);
}
-#endif
if (error == 0) {
error = spa_vdev_add(spa, config);
@@ -1261,6 +1571,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_nvlist_conf nvlist of devices to remove
+ * zc_cookie to stop the remove?
+ */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
@@ -1294,11 +1610,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
break;
case VDEV_STATE_FAULTED:
- error = vdev_fault(spa, zc->zc_guid);
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
break;
case VDEV_STATE_DEGRADED:
- error = vdev_degrade(spa, zc->zc_guid);
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
break;
default:
@@ -1321,7 +1645,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
return (error);
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) == 0) {
+ zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
@@ -1346,6 +1670,41 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
}
static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config, *props = NULL;
+ int error;
+ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ spa_close(spa, FTAG);
+ nvlist_free(config);
+ return (error);
+ }
+
+ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
spa_t *spa;
@@ -1379,6 +1738,35 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
return (error);
}
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_USER hold, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ * XXX reading without owning
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent) {
+ if (dmu_objset_type(os) == DMU_OST_ZVOL)
+ VERIFY(zvol_get_stats(os, nv) == 0);
+ }
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
/*
* inputs:
* zc_name name of filesystem
@@ -1394,34 +1782,59 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os = NULL;
int error;
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+
+ error = zfs_ioc_objset_stats_impl(zc, os);
+
+ dmu_objset_rele(os, FTAG);
+
+ if (error == ENOMEM)
+ error = 0;
+ return (error);
+}
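zfs_ioc_objset_stats() deliberately maps ENOMEM from put_nvlist() to success, so the fast stats in zc_objset_stats remain usable even when the property nvlist did not fit the caller's buffer; put_nvlist() is assumed (its tail is not shown in this hunk) to report the size it needed back through zc_nvlist_dst_size. A hedged sketch of a user-space retry loop built on that contract; the command struct and ioctl wrapper are stand-ins, not real libzfs interfaces:

/*
 * Hypothetical retry loop: grow the destination buffer whenever the
 * kernel reports a larger required size in zc_nvlist_dst_size.  On
 * success the caller owns (and must free) zc->zc_nvlist_dst.
 */
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

struct zfs_cmd_stub {
	uint64_t zc_nvlist_dst;
	uint64_t zc_nvlist_dst_size;
};

extern int issue_ioctl(struct zfs_cmd_stub *zc);	/* hypothetical */

static int
get_stats_with_retry(struct zfs_cmd_stub *zc)
{
	uint64_t asked = 1024;
	int err;

	for (;;) {
		zc->zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(asked);
		if (zc->zc_nvlist_dst == 0)
			return (ENOMEM);
		zc->zc_nvlist_dst_size = asked;

		err = issue_ioctl(zc);
		if (err != 0 || zc->zc_nvlist_dst_size <= asked)
			return (err);	/* error, or the nvlist fit */

		/* Too small: free, remember the size the kernel wanted, retry. */
		free((void *)(uintptr_t)zc->zc_nvlist_dst);
		asked = zc->zc_nvlist_dst_size;
	}
}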
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst received property nvlist
+ * zc_nvlist_dst_size size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
nvlist_t *nv;
- if (error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
return (error);
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+ /*
+ * Without this check, we would return local property values if the
+ * caller has not already received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ if (!dsl_prop_get_hasrecvd(os)) {
+ dmu_objset_rele(os, FTAG);
+ return (ENOTSUP);
+ }
if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) {
- dmu_objset_stats(os, nv);
- /*
- * NB: zvol_get_stats() will read the objset contents,
- * which we aren't supposed to do with a
- * DS_MODE_USER hold, because it could be
- * inconsistent. So this is a bit of a workaround...
- */
- if (!zc->zc_objset_stats.dds_inconsistent) {
- if (dmu_objset_type(os) == DMU_OST_ZVOL)
- VERIFY(zvol_get_stats(os, nv) == 0);
- }
+ (error = dsl_prop_get_received(os, &nv)) == 0) {
error = put_nvlist(zc, nv);
nvlist_free(nv);
}
- dmu_objset_close(os);
- if (error == ENOMEM)
- error = 0;
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -1456,8 +1869,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
objset_t *os;
int err;
- if (err = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+ /* XXX reading without owning */
+ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
return (err);
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
@@ -1482,11 +1895,11 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
} else {
err = ENOENT;
}
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (err);
}
-static boolean_t
+boolean_t
dataset_name_hidden(const char *name)
{
/*
@@ -1522,9 +1935,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
objset_t *os;
int error;
char *p;
+ size_t orig_len = strlen(zc->zc_name);
- if (error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) {
+top:
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
if (error == ENOENT)
error = ESRCH;
return (error);
@@ -1544,7 +1958,7 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
int len = sizeof (zc->zc_name) - (p - zc->zc_name);
while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
- (void) dmu_objset_prefetch(p, NULL);
+ (void) dmu_objset_prefetch(zc->zc_name, NULL);
}
do {
@@ -1553,12 +1967,22 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
NULL, &zc->zc_cookie);
if (error == ENOENT)
error = ESRCH;
- } while (error == 0 && dataset_name_hidden(zc->zc_name));
- dmu_objset_close(os);
+ } while (error == 0 && dataset_name_hidden(zc->zc_name) &&
+ !(zc->zc_iflags & FKIOCTL));
+ dmu_objset_rele(os, FTAG);
- if (error == 0)
+ /*
+ * If it's an internal dataset (i.e., with a '$' in its name),
+ * don't try to get stats for it, otherwise we'll return ENOENT.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
+ if (error == ENOENT) {
+ /* We lost a race with destroy, get the next one. */
+ zc->zc_name[orig_len] = '\0';
+ goto top;
+ }
+ }
return (error);
}
@@ -1580,299 +2004,363 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
objset_t *os;
int error;
- error = dmu_objset_open(zc->zc_name,
- DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os);
+top:
+ if (zc->zc_cookie == 0)
+ (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
+ NULL, DS_FIND_SNAPSHOTS);
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error)
return (error == ENOENT ? ESRCH : error);
- if (zc->zc_cookie == 0) {
- (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
- NULL, DS_FIND_SNAPSHOTS);
- }
/*
* A dataset name of maximum length cannot have any snapshots,
* so exit immediately.
*/
if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (ESRCH);
}
error = dmu_snapshot_list_next(os,
sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
- dmu_objset_close(os);
- if (error == 0)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
- else if (error == ENOENT)
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
+ NULL);
+
+ if (error == 0) {
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+
+ /*
+ * Since we probably don't have a hold on this snapshot,
+ * it's possible that the objsetid could have been destroyed
+ * and reused for a new objset. It's OK if this happens during
+ * a zfs send operation, since the new createtxg will be
+ * beyond the range we're interested in.
+ */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ if (error == ENOENT) {
+ /* Racing with destroy, get the next one. */
+ *strchr(zc->zc_name, '@') = '\0';
+ dmu_objset_rele(os, FTAG);
+ goto top;
+ }
+ } else {
+ objset_t *ossnap;
+
+ error = dmu_objset_from_ds(ds, &ossnap);
+ if (error == 0)
+ error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ } else if (error == ENOENT) {
error = ESRCH;
+ }
+ dmu_objset_rele(os, FTAG);
/* if we failed, undo the @ that we tacked on to zc_name */
if (error)
*strchr(zc->zc_name, '@') = '\0';
return (error);
}
-int
-zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
- nvpair_t *elem;
- int error = 0;
- uint64_t intval;
- char *strval;
- nvlist_t *genericnvl;
- boolean_t issnap = (strchr(name, '@') != NULL);
+ const char *propname = nvpair_name(pair);
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *domain;
+ char *dash;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+ int err;
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) != 0)
+ return (EINVAL);
+ }
/*
- * First validate permission to set all of the properties
+ * A correctly constructed propname is encoded as
+ * userquota@<rid>-<domain>.
*/
- VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
+ if ((dash = strchr(propname, '-')) == NULL ||
+ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+ vallen != 3)
+ return (EINVAL);
- if (prop == ZPROP_INVAL) {
- /*
- * If this is a user-defined property, it must be a
- * string, and there is no further validation to do.
- */
- if (zfs_prop_user(propname) &&
- nvpair_type(elem) == DATA_TYPE_STRING) {
- if (error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_USERPROP, CRED()))
- return (error);
- continue;
- }
+ domain = dash + 1;
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
- if (!issnap && zfs_prop_userquota(propname) &&
- nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) {
- const char *perm;
- const char *up = zfs_userquota_prop_prefixes
- [ZFS_PROP_USERQUOTA];
- if (strncmp(propname, up, strlen(up)) == 0)
- perm = ZFS_DELEG_PERM_USERQUOTA;
- else
- perm = ZFS_DELEG_PERM_GROUPQUOTA;
- if (error = zfs_secpolicy_write_perms(name,
- perm, CRED()))
- return (error);
- continue;
- }
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+ if (err == 0) {
+ err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
- return (EINVAL);
- }
+ return (err);
+}
- if (issnap)
- return (EINVAL);
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise if it is
+ * not one of the special properties handled by this function, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err;
- if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
- return (error);
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_userquota(propname))
+ return (zfs_prop_set_userquota(dsname, pair));
+ return (-1);
+ }
- /*
- * Check that this value is valid for this pool version
- */
- switch (prop) {
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0) {
- if (intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9 &&
- zfs_earlier_version(name,
- SPA_VERSION_GZIP_COMPRESSION))
- return (ENOTSUP);
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
- /*
- * If this is a bootable dataset then
- * verify that the compression algorithm
- * is supported for booting. We must return
- * something other than ENOTSUP since it
- * implies a downrev pool version.
- */
- if (zfs_is_bootfs(name) &&
- !BOOTFS_COMPRESS_VALID(intval))
- return (ERANGE);
- }
- break;
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
+ return (-1);
- case ZFS_PROP_COPIES:
- if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS))
- return (ENOTSUP);
- break;
+ VERIFY(0 == nvpair_value_uint64(pair, &intval));
- case ZFS_PROP_SHARESMB:
- if (zpl_earlier_version(name, ZPL_VERSION_FUID))
- return (ENOTSUP);
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ err = dsl_dir_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFQUOTA:
+ err = dsl_dataset_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_RESERVATION:
+ err = dsl_dir_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFRESERVATION:
+ err = dsl_dataset_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_VOLSIZE:
+ err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
+ intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
break;
- case ZFS_PROP_ACLINHERIT:
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0)
- if (intval == ZFS_ACL_PASSTHROUGH_X &&
- zfs_earlier_version(name,
- SPA_VERSION_PASSTHROUGH_X))
- return (ENOTSUP);
- }
- }
+ err = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
- zfs_prop_t prop = zfs_name_to_prop(propname);
+ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t *zc;
- if (prop == ZPROP_INVAL) {
- if (zfs_prop_userquota(propname)) {
- uint64_t *valary;
- unsigned int vallen;
- const char *domain;
- zfs_userquota_prop_t type;
- uint64_t rid;
- uint64_t quota;
- zfsvfs_t *zfsvfs;
-
- VERIFY(nvpair_value_uint64_array(elem,
- &valary, &vallen) == 0);
- VERIFY(vallen == 3);
- type = valary[0];
- rid = valary[1];
- quota = valary[2];
- domain = propname +
- strlen(zfs_userquota_prop_prefixes[type]);
-
- error = zfsvfs_hold(name, B_FALSE, FTAG,
- &zfsvfs);
- if (error == 0) {
- error = zfs_set_userquota(zfsvfs,
- type, domain, rid, quota);
- zfsvfs_rele(zfsvfs, FTAG);
- }
- if (error == 0)
- continue;
- else
- goto out;
- } else if (zfs_prop_user(propname)) {
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- goto out;
- }
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strcpy(zc->zc_name, dsname);
+ (void) zfs_ioc_userspace_upgrade(zc);
+ kmem_free(zc, sizeof (zfs_cmd_t));
}
+ break;
+ }
- switch (prop) {
- case ZFS_PROP_QUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_quota(name, intval)) != 0)
- goto out;
- break;
-
- case ZFS_PROP_REFQUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_quota(name, intval)) != 0)
- goto out;
- break;
-
- case ZFS_PROP_RESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_reservation(name,
- intval)) != 0)
- goto out;
- break;
+ default:
+ err = -1;
+ }
- case ZFS_PROP_REFRESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dataset_set_reservation(name,
- intval)) != 0)
- goto out;
- break;
+ return (err);
+}
- case ZFS_PROP_VOLSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volsize(name,
- ddi_driver_major(zfs_dip), intval)) != 0)
- goto out;
- break;
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the first error
+ * encountered. If the caller provides a non-NULL errlist, it also gives the
+ * complete list of names of all the properties it failed to set along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property is set successfully, zero is returned and the list pointed
+ * at by errlist is NULL.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+ nvlist_t **errlist)
+{
+ nvpair_t *pair;
+ nvpair_t *propval;
+ int rv = 0;
+ uint64_t intval;
+ char *strval;
+ nvlist_t *genericnvl;
+ nvlist_t *errors;
+ nvlist_t *retrynvl;
- case ZFS_PROP_VOLBLOCKSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volblocksize(name, intval)) != 0)
- goto out;
- break;
+ VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- case ZFS_PROP_VERSION:
- {
- zfsvfs_t *zfsvfs;
-
- if ((error = nvpair_value_uint64(elem, &intval)) != 0)
- goto out;
- if ((error = zfsvfs_hold(name, B_FALSE, FTAG,
- &zfsvfs)) != 0)
- goto out;
- error = zfs_set_version(zfsvfs, intval);
- zfsvfs_rele(zfsvfs, FTAG);
-
- if (error == 0 && intval >= ZPL_VERSION_USERSPACE) {
- zfs_cmd_t zc = { 0 };
- (void) strcpy(zc.zc_name, name);
- (void) zfs_ioc_userspace_upgrade(&zc);
- }
- if (error)
- goto out;
- break;
+retry:
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ int err = 0;
+
+ /* decode the property value */
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) != 0)
+ err = EINVAL;
}
- default:
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) !=
- PROP_TYPE_STRING) {
- error = EINVAL;
- goto out;
- }
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ /* Validate value type */
+ if (err == 0 && prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (nvpair_type(propval) != DATA_TYPE_STRING)
+ err = EINVAL;
+ } else if (zfs_prop_userquota(propname)) {
+ if (nvpair_type(propval) !=
+ DATA_TYPE_UINT64_ARRAY)
+ err = EINVAL;
+ }
+ } else if (err == 0) {
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+ err = EINVAL;
+ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
const char *unused;
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+ VERIFY(nvpair_value_uint64(propval,
+ &intval) == 0);
switch (zfs_prop_get_type(prop)) {
case PROP_TYPE_NUMBER:
break;
case PROP_TYPE_STRING:
- error = EINVAL;
- goto out;
+ err = EINVAL;
+ break;
case PROP_TYPE_INDEX:
if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0) {
- error = EINVAL;
- goto out;
- }
+ intval, &unused) != 0)
+ err = EINVAL;
break;
default:
cmn_err(CE_PANIC,
"unknown property type");
- break;
}
} else {
- error = EINVAL;
- goto out;
+ err = EINVAL;
+ }
+ }
+
+ /* Validate permissions */
+ if (err == 0)
+ err = zfs_check_settable(dsname, pair, CRED());
+
+ if (err == 0) {
+ err = zfs_prop_set_special(dsname, source, pair);
+ if (err == -1) {
+ /*
+ * For better performance we build up a list of
+ * properties to set in a single transaction.
+ */
+ err = nvlist_add_nvpair(genericnvl, pair);
+ } else if (err != 0 && nvl != retrynvl) {
+ /*
+ * This may be a spurious error caused by
+ * receiving quota and reservation out of order.
+ * Try again in a second pass.
+ */
+ err = nvlist_add_nvpair(retrynvl, pair);
}
- if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0)
- goto out;
}
+
+ if (err != 0)
+ VERIFY(nvlist_add_int32(errors, propname, err) == 0);
}
- if (nvlist_next_nvpair(genericnvl, NULL) != NULL) {
- error = dsl_props_set(name, genericnvl);
+ if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+ nvl = retrynvl;
+ goto retry;
+ }
+
+ if (!nvlist_empty(genericnvl) &&
+ dsl_props_set(dsname, source, genericnvl) != 0) {
+ /*
+ * If this fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ int err = 0;
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) == 0);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ VERIFY(nvpair_value_string(propval,
+ &strval) == 0);
+ err = dsl_prop_set(dsname, propname, source, 1,
+ strlen(strval) + 1, strval);
+ } else {
+ VERIFY(nvpair_value_uint64(propval,
+ &intval) == 0);
+ err = dsl_prop_set(dsname, propname, source, 8,
+ 1, &intval);
+ }
+
+ if (err != 0) {
+ VERIFY(nvlist_add_int32(errors, propname,
+ err) == 0);
+ }
+ }
}
-out:
nvlist_free(genericnvl);
- return (error);
+ nvlist_free(retrynvl);
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
}
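The comment above zfs_set_prop_nvlist() documents its best-effort/errlist contract; a minimal sketch of how an in-kernel caller might consume it, assuming the standard kernel nvlist interfaces, with made-up dataset and property names and cmn_err() standing in for whatever reporting the real caller would do:

/*
 * Illustrative caller of zfs_set_prop_nvlist(): build a property list,
 * apply it, and walk the returned errlist to see which properties were
 * rejected and why.
 */
static void
set_props_example(void)
{
	nvlist_t *props, *errors = NULL;
	nvpair_t *pair;
	int rv;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(props, "quota", 10ULL << 30) == 0);
	VERIFY(nvlist_add_string(props, "com.example:note", "demo") == 0);

	rv = zfs_set_prop_nvlist("tank/fs", ZPROP_SRC_LOCAL, props, &errors);
	if (rv != 0)
		cmn_err(CE_NOTE, "first error setting props: %d", rv);

	if (errors != NULL) {
		for (pair = nvlist_next_nvpair(errors, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(errors, pair)) {
			int32_t err;

			VERIFY(nvpair_value_int32(pair, &err) == 0);
			cmn_err(CE_NOTE, "failed to set %s: %d",
			    nvpair_name(pair), err);
		}
		nvlist_free(errors);
	}
	nvlist_free(props);
}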
/*
@@ -1881,15 +2369,15 @@ out:
static int
zfs_check_userprops(char *fsname, nvlist_t *nvl)
{
- nvpair_t *elem = NULL;
+ nvpair_t *pair = NULL;
int error = 0;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- const char *propname = nvpair_name(elem);
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
char *valstr;
if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
+ nvpair_type(pair) != DATA_TYPE_STRING)
return (EINVAL);
if (error = zfs_secpolicy_write_perms(fsname,
@@ -1899,49 +2387,96 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl)
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG);
- VERIFY(nvpair_value_string(elem, &valstr) == 0);
+ VERIFY(nvpair_value_string(pair, &valstr) == 0);
if (strlen(valstr) >= ZAP_MAXVALUELEN)
return (E2BIG);
}
return (0);
}
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+ nvpair_t *pair;
+
+ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+ if (nvlist_exists(skipped, nvpair_name(pair)))
+ continue;
+
+ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+ }
+}
+
+static int
+clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
+ nvlist_t *skipped)
+{
+ int err = 0;
+ nvlist_t *cleared_props = NULL;
+ props_skip(props, skipped, &cleared_props);
+ if (!nvlist_empty(cleared_props)) {
+ /*
+ * Acts on local properties until the dataset has received
+ * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+ */
+ zprop_source_t flags = (ZPROP_SRC_NONE |
+ (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
+ err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
+ }
+ nvlist_free(cleared_props);
+ return (err);
+}
+
/*
* inputs:
* zc_name name of filesystem
* zc_value name of property to set
* zc_nvlist_src{_size} nvlist of properties to apply
- * zc_cookie clear existing local props?
+ * zc_cookie received properties flag
*
- * outputs: none
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
*/
static int
zfs_ioc_set_prop(zfs_cmd_t *zc)
{
nvlist_t *nvl;
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+ ZPROP_SRC_LOCAL);
+ nvlist_t *errors = NULL;
int error;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvl)) != 0)
+ zc->zc_iflags, &nvl)) != 0)
return (error);
- if (zc->zc_cookie) {
+ if (received) {
nvlist_t *origprops;
objset_t *os;
- if (dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- if (dsl_prop_get_all(os, &origprops, TRUE) == 0) {
- clear_props(zc->zc_name, origprops, nvl);
+ if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
+ if (dsl_prop_get_received(os, &origprops) == 0) {
+ (void) clear_received_props(os,
+ zc->zc_name, origprops, nvl);
nvlist_free(origprops);
}
- dmu_objset_close(os);
- }
+ dsl_prop_set_hasrecvd(os);
+ dmu_objset_rele(os, FTAG);
+ }
}
- error = zfs_set_prop_nvlist(zc->zc_name, nvl);
+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors);
+ if (zc->zc_nvlist_dst != 0 && errors != NULL) {
+ (void) put_nvlist(zc, errors);
+ }
+
+ nvlist_free(errors);
nvlist_free(nvl);
return (error);
}
@@ -1950,14 +2485,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
* inputs:
* zc_name name of filesystem
* zc_value name of property to inherit
+ * zc_cookie revert to received value if TRUE
*
* outputs: none
*/
static int
zfs_ioc_inherit_prop(zfs_cmd_t *zc)
{
+ const char *propname = zc->zc_value;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received
+ ? ZPROP_SRC_NONE /* revert to received value, if any */
+ : ZPROP_SRC_INHERITED); /* explicitly inherit */
+
+ if (received) {
+ nvlist_t *dummy;
+ nvpair_t *pair;
+ zprop_type_t type;
+ int err;
+
+ /*
+ * zfs_prop_set_special() expects properties in the form of an
+ * nvpair with type info.
+ */
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(propname))
+ return (EINVAL);
+
+ type = PROP_TYPE_STRING;
+ } else if (prop == ZFS_PROP_VOLSIZE ||
+ prop == ZFS_PROP_VERSION) {
+ return (EINVAL);
+ } else {
+ type = zfs_prop_get_type(prop);
+ }
+
+ VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ switch (type) {
+ case PROP_TYPE_STRING:
+ VERIFY(0 == nvlist_add_string(dummy, propname, ""));
+ break;
+ case PROP_TYPE_NUMBER:
+ case PROP_TYPE_INDEX:
+ VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
+ break;
+ default:
+ nvlist_free(dummy);
+ return (EINVAL);
+ }
+
+ pair = nvlist_next_nvpair(dummy, NULL);
+ err = zfs_prop_set_special(zc->zc_name, source, pair);
+ nvlist_free(dummy);
+ if (err != -1)
+ return (err); /* special property already handled */
+ } else {
+ /*
+ * Only check this in the non-received case. We want to allow
+ * 'inherit -S' to revert non-inheritable properties like quota
+ * and reservation to the received or default values even though
+ * they are not considered inheritable.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ return (EINVAL);
+ }
+
/* the property name has been validated by zfs_secpolicy_inherit() */
- return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
+ return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
}
static int
@@ -1966,28 +2562,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
nvlist_t *props;
spa_t *spa;
int error;
- nvpair_t *elem;
+ nvpair_t *pair;
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)))
+ if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))
return (error);
/*
* If the only property is the configfile, then just do a spa_lookup()
* to handle the faulted case.
*/
- elem = nvlist_next_nvpair(props, NULL);
- if (elem != NULL && strcmp(nvpair_name(elem),
+ pair = nvlist_next_nvpair(props, NULL);
+ if (pair != NULL && strcmp(nvpair_name(pair),
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
- nvlist_next_nvpair(props, elem) == NULL) {
+ nvlist_next_nvpair(props, pair) == NULL) {
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_config_sync(spa, B_FALSE, B_TRUE);
}
mutex_exit(&spa_namespace_lock);
- if (spa != NULL)
+ if (spa != NULL) {
+ nvlist_free(props);
return (0);
+ }
}
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
@@ -2034,57 +2632,6 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
return (error);
}
-static int
-zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
-{
-#ifdef sun
- nvlist_t *nvp;
- int error;
- uint32_t uid;
- uint32_t gid;
- uint32_t *groups;
- uint_t group_cnt;
- cred_t *usercred;
-
- if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvp)) != 0) {
- return (error);
- }
-
- if ((error = nvlist_lookup_uint32(nvp,
- ZFS_DELEG_PERM_UID, &uid)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
-
- if ((error = nvlist_lookup_uint32(nvp,
- ZFS_DELEG_PERM_GID, &gid)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
-
- if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS,
- &groups, &group_cnt)) != 0) {
- nvlist_free(nvp);
- return (EPERM);
- }
- usercred = cralloc();
- if ((crsetugid(usercred, uid, gid) != 0) ||
- (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) {
- nvlist_free(nvp);
- crfree(usercred);
- return (EPERM);
- }
- nvlist_free(nvp);
- error = dsl_deleg_access(zc->zc_name,
- zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
- crfree(usercred);
- return (error);
-#else /* sun */
- return (EPERM);
-#endif /* sun */
-}
-
/*
* inputs:
* zc_name name of filesystem
@@ -2100,7 +2647,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
nvlist_t *fsaclnv = NULL;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &fsaclnv)) != 0)
+ zc->zc_iflags, &fsaclnv)) != 0)
return (error);
/*
@@ -2157,30 +2704,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
}
/*
- * inputs:
- * zc_name name of volume
- *
- * outputs: none
- */
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
- return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip)));
-}
-
-/*
- * inputs:
- * zc_name name of volume
- *
- * outputs: none
- */
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
- return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
* Search the vfs list for a specified resource. Returns a pointer to it
* or NULL if no suitable entry is found. The caller of this routine
* is responsible for releasing the returned vfs pointer.
@@ -2234,8 +2757,8 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
*/
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
- boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
- boolean_t *is_ci)
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
{
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
@@ -2271,6 +2794,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
(zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
(zplver < ZPL_VERSION_NORMALIZATION &&
(norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
sense != ZFS_PROP_UNDEFINED)))
@@ -2312,11 +2836,13 @@ static int
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok, sa_ok;
uint64_t zplver = ZPL_VERSION;
objset_t *os = NULL;
char parentname[MAXNAMELEN];
char *cp;
+ spa_t *spa;
+ uint64_t spa_vers;
int error;
(void) strlcpy(parentname, dataset, sizeof (parentname));
@@ -2324,23 +2850,25 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
ASSERT(cp != NULL);
cp[0] = '\0';
- if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
- zplver = ZPL_VERSION_USERSPACE - 1;
- if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
/*
* Open parent object set so we can inherit zplprop values.
*/
- if ((error = dmu_objset_open(parentname, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os)) != 0)
+ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
return (error);
- error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops,
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
zplprops, is_ci);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (error);
}
@@ -2348,17 +2876,17 @@ static int
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
uint64_t zplver = ZPL_VERSION;
int error;
- if (spa_vers < SPA_VERSION_FUID) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
- error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops,
- zplprops, is_ci);
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
return (error);
}
@@ -2401,7 +2929,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
zct.zct_zplprops = NULL;
@@ -2417,21 +2945,18 @@ zfs_ioc_create(zfs_cmd_t *zc)
return (EINVAL);
}
- error = dmu_objset_open(zc->zc_value, type,
- DS_MODE_USER | DS_MODE_READONLY, &clone);
+ error = dmu_objset_hold(zc->zc_value, FTAG, &clone);
if (error) {
nvlist_free(nvprops);
return (error);
}
- error = dmu_objset_create(zc->zc_name, type, clone, 0,
- NULL, NULL);
+ error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0);
+ dmu_objset_rele(clone, FTAG);
if (error) {
- dmu_objset_close(clone);
nvlist_free(nvprops);
return (error);
}
- dmu_objset_close(clone);
} else {
boolean_t is_insensitive = B_FALSE;
@@ -2488,7 +3013,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
return (error);
}
}
- error = dmu_objset_create(zc->zc_name, type, NULL,
+ error = dmu_objset_create(zc->zc_name, type,
is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
nvlist_free(zct.zct_zplprops);
}
@@ -2497,10 +3022,16 @@ zfs_ioc_create(zfs_cmd_t *zc)
* It would be nice to do this atomically.
*/
if (error == 0) {
- if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
+ error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL,
+ nvprops, NULL);
+ if (error != 0)
+ (void) dmu_objset_destroy(zc->zc_name, B_FALSE);
}
nvlist_free(nvprops);
+#ifdef __FreeBSD__
+ if (error == 0 && type == DMU_OST_ZVOL)
+ zvol_create_minors(zc->zc_name);
+#endif
return (error);
}
@@ -2511,7 +3042,8 @@ zfs_ioc_create(zfs_cmd_t *zc)
* zc_cookie recursive flag
* zc_nvlist_src[_size] property list
*
- * outputs: none
+ * outputs:
+ * zc_value short snapname (i.e. part after the '@')
*/
static int
zfs_ioc_snapshot(zfs_cmd_t *zc)
@@ -2525,21 +3057,21 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
error = zfs_check_userprops(zc->zc_name, nvprops);
if (error)
goto out;
- if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL &&
+ if (!nvlist_empty(nvprops) &&
zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
error = ENOTSUP;
goto out;
}
- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
- nvprops, recursive);
+ error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL,
+ nvprops, recursive, B_FALSE, -1);
out:
nvlist_free(nvprops);
@@ -2547,20 +3079,15 @@ out:
}
int
-zfs_unmount_snap(char *name, void *arg)
+zfs_unmount_snap(const char *name, void *arg)
{
vfs_t *vfsp = NULL;
if (arg) {
char *snapname = arg;
- int len = strlen(name) + strlen(snapname) + 2;
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- (void) strcpy(buf, name);
- (void) strcat(buf, "@");
- (void) strcat(buf, snapname);
- vfsp = zfs_get_vfs(buf);
- kmem_free(buf, len);
+ char *fullname = kmem_asprintf("%s@%s", name, snapname);
+ vfsp = zfs_get_vfs(fullname);
+ strfree(fullname);
} else if (strchr(name, '@')) {
vfsp = zfs_get_vfs(name);
}
@@ -2586,8 +3113,9 @@ zfs_unmount_snap(char *name, void *arg)
/*
* inputs:
- * zc_name name of filesystem
- * zc_value short name of snapshot
+ * zc_name name of filesystem
+ * zc_value short name of snapshot
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
@@ -2602,26 +3130,32 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
if (err)
return (err);
- return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+ return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value,
+ zc->zc_defer_destroy));
}
/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
+ int err;
if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
+ err = zfs_unmount_snap(zc->zc_name, NULL);
if (err)
return (err);
}
- return (dmu_objset_destroy(zc->zc_name));
+ err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
+ if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+ (void) zvol_remove_minor(zc->zc_name);
+ return (err);
}
/*
@@ -2633,38 +3167,78 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
static int
zfs_ioc_rollback(zfs_cmd_t *zc)
{
- objset_t *os;
+ dsl_dataset_t *ds, *clone;
int error;
- zfsvfs_t *zfsvfs = NULL;
+ zfsvfs_t *zfsvfs;
+ char *clone_name;
+
+ error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
+ if (error)
+ return (error);
+
+ /* must not be a snapshot */
+ if (dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EINVAL);
+ }
+
+ /* must have a most recent snapshot */
+ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EINVAL);
+ }
/*
- * Get the zfsvfs for the receiving objset. There
- * won't be one if we're operating on a zvol, if the
- * objset doesn't exist yet, or is not mounted.
+ * Create clone of most recent snapshot.
*/
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os);
+ clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
+ error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
if (error)
- return (error);
+ goto out;
- if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
- int mode;
+ error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone);
+ if (error)
+ goto out;
- error = zfs_suspend_fs(zfsvfs, NULL, &mode);
+ /*
+ * Do clone swap.
+ */
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
int resume_err;
- error = dmu_objset_rollback(os);
- resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
+ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+ error = dsl_dataset_clone_swap(clone, ds,
+ B_TRUE);
+ dsl_dataset_disown(ds, FTAG);
+ ds = NULL;
+ } else {
+ error = EBUSY;
+ }
+ resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
error = error ? error : resume_err;
- } else {
- dmu_objset_close(os);
}
VFS_RELE(zfsvfs->z_vfs);
} else {
- error = dmu_objset_rollback(os);
+ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+ error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
+ dsl_dataset_disown(ds, FTAG);
+ ds = NULL;
+ } else {
+ error = EBUSY;
+ }
}
- /* Note, the dmu_objset_rollback() releases the objset for us. */
+ /*
+ * Destroy clone (which also closes it).
+ */
+ (void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+
+out:
+ strfree(clone_name);
+ if (ds)
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
@@ -2697,31 +3271,267 @@ zfs_ioc_rename(zfs_cmd_t *zc)
if (err)
return (err);
}
+ if (zc->zc_objset_type == DMU_OST_ZVOL)
+ (void) zvol_remove_minor(zc->zc_name);
return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
}
-static void
-clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+ const char *propname = nvpair_name(pair);
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (err = zfs_secpolicy_write_perms(dsname,
+ ZFS_DELEG_PERM_USERPROP, cr))
+ return (err);
+ return (0);
+ }
+
+ if (!issnap && zfs_prop_userquota(propname)) {
+ const char *perm = NULL;
+ const char *uq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+ const char *gq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+
+ if (strncmp(propname, uq_prefix,
+ strlen(uq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ } else if (strncmp(propname, gq_prefix,
+ strlen(gq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ } else {
+ /* USERUSED and GROUPUSED are read-only */
+ return (EINVAL);
+ }
+
+ if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
+ return (err);
+ return (0);
+ }
+
+ return (EINVAL);
+ }
+
+ if (issnap)
+ return (EINVAL);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /*
+ * Check that this value is valid for this pool version
+ */
+ switch (prop) {
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval >= ZIO_COMPRESS_GZIP_1 &&
+ intval <= ZIO_COMPRESS_GZIP_9 &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_GZIP_COMPRESSION)) {
+ return (ENOTSUP);
+ }
+
+ if (intval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (ENOTSUP);
+
+ /*
+ * If this is a bootable dataset then
+ * verify that the compression algorithm
+ * is supported for booting. We must return
+ * something other than ENOTSUP since it
+ * implies a downrev pool version.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ !BOOTFS_COMPRESS_VALID(intval)) {
+ return (ERANGE);
+ }
+ }
+ break;
+
+ case ZFS_PROP_COPIES:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_DEDUP:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_SHARESMB:
+ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ return (ENOTSUP);
+ break;
+
+ case ZFS_PROP_ACLINHERIT:
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval == ZFS_ACL_PASSTHROUGH_X &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_PASSTHROUGH_X))
+ return (ENOTSUP);
+ }
+ break;
+ }
+
+ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
{
zfs_cmd_t *zc;
- nvpair_t *prop;
+ nvpair_t *pair, *next_pair;
+ nvlist_t *errors;
+ int err, rv = 0;
if (props == NULL)
- return;
+ return (0);
+
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
(void) strcpy(zc->zc_name, dataset);
- for (prop = nvlist_next_nvpair(props, NULL); prop;
- prop = nvlist_next_nvpair(props, prop)) {
- if (newprops != NULL &&
- nvlist_exists(newprops, nvpair_name(prop)))
- continue;
- (void) strcpy(zc->zc_value, nvpair_name(prop));
- if (zfs_secpolicy_inherit(zc, CRED()) == 0)
- (void) zfs_ioc_inherit_prop(zc);
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ (void) strcpy(zc->zc_value, nvpair_name(pair));
+ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+ (err = zfs_secpolicy_inherit(zc, CRED())) != 0) {
+ VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+ VERIFY(nvlist_add_int32(errors,
+ zc->zc_value, err) == 0);
+ }
+ pair = next_pair;
}
kmem_free(zc, sizeof (zfs_cmd_t));
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
+}
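
The errlist contract described above can be exercised with a short sketch; the helper below is illustrative only (it is not part of this change) and assumes the standard in-kernel nvlist and cmn_err interfaces.

static void
example_report_unclearable(char *dataset, nvlist_t *received)
{
	nvlist_t *errlist = NULL;
	nvpair_t *pair;

	/* Strips failing properties from 'received'; errlist names them. */
	if (zfs_check_clearable(dataset, received, &errlist) == 0)
		return;		/* every property passed; errlist is NULL */

	for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(errlist, pair)) {
		int32_t err;

		VERIFY(nvpair_value_int32(pair, &err) == 0);
		cmn_err(CE_NOTE, "cannot clear property %s: error %d",
		    nvpair_name(pair), err);
	}
	nvlist_free(errlist);
}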
+
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+ if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+ /* dsl_prop_get_all_impl() format */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p1) == 0);
+ }
+
+ if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p2) == 0);
+ }
+
+ if (nvpair_type(p1) != nvpair_type(p2))
+ return (B_FALSE);
+
+ if (nvpair_type(p1) == DATA_TYPE_STRING) {
+ char *valstr1, *valstr2;
+
+ VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
+ return (strcmp(valstr1, valstr2) == 0);
+ } else {
+ uint64_t intval1, intval2;
+
+ VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+ VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+ return (intval1 == intval2);
+ }
+}
+
+/*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+ nvpair_t *pair, *next_pair;
+
+ if (origprops == NULL)
+ return; /* all props need to be received */
+
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ const char *propname = nvpair_name(pair);
+ nvpair_t *match;
+
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ if ((nvlist_lookup_nvpair(origprops, propname,
+ &match) != 0) || !propval_equals(pair, match))
+ goto next; /* need to set received value */
+
+ /* don't clear the existing received value */
+ (void) nvlist_remove_nvpair(origprops, match);
+ /* don't bother receiving the property */
+ (void) nvlist_remove_nvpair(props, pair);
+next:
+ pair = next_pair;
+ }
}
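
As a concrete (hypothetical) illustration of the reduction: if props holds { copies=1, compression=2 } and origprops holds { copies=3, compression=2 }, the call below leaves only copies in props (it still needs to be received) and only copies in origprops (its stale value still needs to be cleared); compression drops out of both lists. A minimal sketch, assuming the in-kernel nvlist routines already used in this file:

	nvlist_t *props, *origprops;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&origprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	VERIFY(nvlist_add_uint64(props, "copies", 1) == 0);
	VERIFY(nvlist_add_uint64(props, "compression", 2) == 0);
	VERIFY(nvlist_add_uint64(origprops, "copies", 3) == 0);
	VERIFY(nvlist_add_uint64(origprops, "compression", 2) == 0);

	props_reduce(props, origprops);
	/* props now contains only copies=1; origprops only copies=3 */

	nvlist_free(props);
	nvlist_free(origprops);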
+#ifdef DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
/*
* inputs:
* zc_name name of containing filesystem
@@ -2731,9 +3541,14 @@ clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
* zc_cookie file descriptor to recv from
* zc_begin_record the BEGIN record of the stream (not byteswapped)
* zc_guid force flag
+ * zc_cleanup_fd cleanup-on-exit file descriptor
+ * zc_action_handle handle for this guid/ds mapping (or zero on first call)
*
* outputs:
* zc_cookie number of bytes read
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ * zc_obj zprop_errflags_t
+ * zc_action_handle handle for this guid/ds mapping
*/
static int
zfs_ioc_recv(zfs_cmd_t *zc)
@@ -2741,15 +3556,18 @@ zfs_ioc_recv(zfs_cmd_t *zc)
file_t *fp;
objset_t *os;
dmu_recv_cookie_t drc;
- zfsvfs_t *zfsvfs = NULL;
boolean_t force = (boolean_t)zc->zc_guid;
- int error, fd;
+ int fd;
+ int error = 0;
+ int props_error = 0;
+ nvlist_t *errors;
offset_t off;
- nvlist_t *props = NULL;
- nvlist_t *origprops = NULL;
+ nvlist_t *props = NULL; /* sent properties */
+ nvlist_t *origprops = NULL; /* existing properties */
objset_t *origin = NULL;
char *tosnap;
char tofs[ZFS_MAXNAMELEN];
+ boolean_t first_recvd_props = B_FALSE;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '@') == NULL ||
@@ -2758,123 +3576,204 @@ zfs_ioc_recv(zfs_cmd_t *zc)
(void) strcpy(tofs, zc->zc_value);
tosnap = strchr(tofs, '@');
- *tosnap = '\0';
- tosnap++;
+ *tosnap++ = '\0';
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)) != 0)
+ zc->zc_iflags, &props)) != 0)
return (error);
fd = zc->zc_cookie;
- fp = getf(fd, 0);
+ fp = getf(fd);
if (fp == NULL) {
nvlist_free(props);
return (EBADF);
}
- if (getzfsvfs(tofs, &zfsvfs) == 0) {
- if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
- VFS_RELE(zfsvfs->z_vfs);
- zfsvfs = NULL;
- error = EBUSY;
- goto out;
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
+ !dsl_prop_get_hasrecvd(os)) {
+ first_recvd_props = B_TRUE;
}
+
/*
- * If new properties are supplied, they are to completely
- * replace the existing ones, so stash away the existing ones.
- */
- if (props)
- (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
- } else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- /*
- * Get the props even if there was no zfsvfs (zvol or
- * unmounted zpl).
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties, so stash
+ * away the existing ones.
*/
- (void) dsl_prop_get_all(os, &origprops, TRUE);
+ if (dsl_prop_get_received(os, &origprops) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */
+ if (!first_recvd_props)
+ props_reduce(props, origprops);
+ if (zfs_check_clearable(tofs, origprops,
+ &errlist) != 0)
+ (void) nvlist_merge(errors, errlist, 0);
+ nvlist_free(errlist);
+ }
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
if (zc->zc_string[0]) {
- error = dmu_objset_open(zc->zc_string, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &origin);
+ error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
if (error)
goto out;
}
- error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
- force, origin, zfsvfs != NULL, &drc);
+ error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
+ &zc->zc_begin_record, force, origin, &drc);
if (origin)
- dmu_objset_close(origin);
+ dmu_objset_rele(origin, FTAG);
if (error)
goto out;
/*
- * Reset properties. We do this before we receive the stream
- * so that the properties are applied to the new data.
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
*/
if (props) {
- clear_props(tofs, origprops, props);
+ nvlist_t *errlist;
+
+ if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
+ if (drc.drc_newfs) {
+ if (spa_version(os->os_spa) >=
+ SPA_VERSION_RECVD_PROPS)
+ first_recvd_props = B_TRUE;
+ } else if (origprops != NULL) {
+ if (clear_received_props(os, tofs, origprops,
+ first_recvd_props ? NULL : props) != 0)
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ } else {
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ }
+ dsl_prop_set_hasrecvd(os);
+ } else if (!drc.drc_newfs) {
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ }
+
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ props, &errlist);
+ (void) nvlist_merge(errors, errlist, 0);
+ nvlist_free(errlist);
+ }
+
+ if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) {
/*
- * XXX - Note, this is all-or-nothing; should be best-effort.
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
*/
- (void) zfs_set_prop_nvlist(tofs, props);
+ props_error = EINVAL;
}
off = fp->f_offset;
- error = dmu_recv_stream(&drc, fp, &off);
+ error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
+ &zc->zc_action_handle);
- if (error == 0 && zfsvfs) {
- char *osname;
- int mode;
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
- /* online recv */
- osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- error = zfs_suspend_fs(zfsvfs, osname, &mode);
- if (error == 0) {
- int resume_err;
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ int end_err;
- error = dmu_recv_end(&drc);
- resume_err = zfs_resume_fs(zfsvfs, osname, mode);
- error = error ? error : resume_err;
+ error = zfs_suspend_fs(zfsvfs);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc);
+ if (error == 0)
+ error = zfs_resume_fs(zfsvfs, tofs);
+ error = error ? error : end_err;
+ VFS_RELE(zfsvfs->z_vfs);
} else {
- dmu_recv_abort_cleanup(&drc);
+ error = dmu_recv_end(&drc);
}
- kmem_free(osname, MAXNAMELEN);
- } else if (error == 0) {
- error = dmu_recv_end(&drc);
}
zc->zc_cookie = off - fp->f_offset;
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
+#ifdef DEBUG
+ if (zfs_ioc_recv_inject_err) {
+ zfs_ioc_recv_inject_err = B_FALSE;
+ error = 1;
+ }
+#endif
/*
* On error, restore the original props.
*/
if (error && props) {
- clear_props(tofs, props, NULL);
- (void) zfs_set_prop_nvlist(tofs, origprops);
+ if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if (clear_received_props(os, tofs, props, NULL) != 0) {
+ /*
+ * We failed to clear the received properties.
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(os);
+ }
+ dmu_objset_rele(os, FTAG);
+ } else if (!drc.drc_newfs) {
+ /* We failed to clear the received properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ if (origprops == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origprops != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origprops, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
}
out:
- if (zfsvfs) {
- mutex_exit(&zfsvfs->z_online_recv_lock);
- VFS_RELE(zfsvfs->z_vfs);
- }
nvlist_free(props);
nvlist_free(origprops);
- releasef(fp);
+ nvlist_free(errors);
+ releasef(fd);
+
+ if (error == 0)
+ error = props_error;
+
return (error);
}
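
After the ioctl returns, zc_obj carries the zprop_errflags_t bits set above; a hedged sketch of how a hypothetical user-space caller might surface them (the flag names come from sys/fs/zfs.h, the reporting helper does not appear in this change):

#include <stdio.h>
#include <sys/fs/zfs.h>		/* zprop_errflags_t */

static void
example_report_recv_prop_errors(uint64_t errflags)
{
	if (errflags & ZPROP_ERR_NOCLEAR)
		(void) fprintf(stderr, "warning: existing received "
		    "properties could not be cleared\n");
	if (errflags & ZPROP_ERR_NORESTORE)
		(void) fprintf(stderr, "warning: original properties "
		    "could not be restored\n");
}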
/*
* inputs:
* zc_name name of snapshot to send
- * zc_value short name of incremental fromsnap (may be empty)
* zc_cookie file descriptor to send stream to
- * zc_obj fromorigin flag (mutually exclusive with zc_value)
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
*
* outputs: none
*/
@@ -2886,36 +3785,55 @@ zfs_ioc_send(zfs_cmd_t *zc)
file_t *fp;
int error;
offset_t off;
+ dsl_dataset_t *ds;
+ dsl_dataset_t *dsfrom = NULL;
+ spa_t *spa;
+ dsl_pool_t *dp;
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &tosnap);
+ error = spa_open(zc->zc_name, &spa, FTAG);
if (error)
return (error);
- if (zc->zc_value[0] != '\0') {
- char *buf;
- char *cp;
-
- buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strncpy(buf, zc->zc_name, MAXPATHLEN);
- cp = strchr(buf, '@');
- if (cp)
- *(cp+1) = 0;
- (void) strlcat(buf, zc->zc_value, MAXPATHLEN);
- error = dmu_objset_open(buf, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &fromsnap);
- kmem_free(buf, MAXPATHLEN);
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ error = dmu_objset_from_ds(ds, &tosnap);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
if (error) {
- dmu_objset_close(tosnap);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
+ error = dmu_objset_from_ds(dsfrom, &fromsnap);
+ if (error) {
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ } else {
+ spa_close(spa, FTAG);
}
- fp = getf(zc->zc_cookie, 1);
+ fp = getf(zc->zc_cookie);
if (fp == NULL) {
- dmu_objset_close(tosnap);
- if (fromsnap)
- dmu_objset_close(fromsnap);
+ dsl_dataset_rele(ds, FTAG);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
return (EBADF);
}
@@ -2924,10 +3842,10 @@ zfs_ioc_send(zfs_cmd_t *zc)
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
- releasef(fp);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- dmu_objset_close(tosnap);
+ releasef(zc->zc_cookie);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
@@ -3003,16 +3921,41 @@ zfs_ioc_clear(zfs_cmd_t *zc)
mutex_exit(&spa_namespace_lock);
return (EIO);
}
- if (spa->spa_log_state == SPA_LOG_MISSING) {
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
/* we need to let spa_open/spa_load clear the chains */
- spa->spa_log_state = SPA_LOG_CLEAR;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
}
+ spa->spa_last_open_failed = 0;
mutex_exit(&spa_namespace_lock);
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ } else {
+ nvlist_t *policy;
+ nvlist_t *config = NULL;
+
+ if (zc->zc_nvlist_src == 0)
+ return (EINVAL);
+
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+ error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+ policy, &config);
+ if (config != NULL) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ nvlist_free(config);
+ }
+ nvlist_free(policy);
+ }
+ }
+
+ if (error)
return (error);
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if (zc->zc_guid == 0) {
vd = NULL;
@@ -3045,7 +3988,8 @@ zfs_ioc_clear(zfs_cmd_t *zc)
* zc_name name of filesystem
* zc_value name of origin snapshot
*
- * outputs: none
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
*/
static int
zfs_ioc_promote(zfs_cmd_t *zc)
@@ -3061,7 +4005,7 @@ zfs_ioc_promote(zfs_cmd_t *zc)
*cp = '\0';
(void) dmu_objset_find(zc->zc_value,
zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
- return (dsl_dataset_promote(zc->zc_name));
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
}
/*
@@ -3085,7 +4029,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (EINVAL);
- error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
@@ -3111,13 +4055,15 @@ static int
zfs_ioc_userspace_many(zfs_cmd_t *zc)
{
zfsvfs_t *zfsvfs;
- int error;
+ int bufsize = zc->zc_nvlist_dst_size;
- error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ if (bufsize <= 0)
+ return (ENOMEM);
+
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
- int bufsize = zc->zc_nvlist_dst_size;
void *buf = kmem_alloc(bufsize, KM_SLEEP);
error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
@@ -3145,34 +4091,31 @@ static int
zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
{
objset_t *os;
- int error;
+ int error = 0;
zfsvfs_t *zfsvfs;
if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
- if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
/*
* If userused is not enabled, it may be because the
* objset needs to be closed & reopened (to grow the
* objset_phys_t). Suspending and resuming the fs will do that.
*/
- int mode;
- error = zfs_suspend_fs(zfsvfs, NULL, &mode);
- if (error == 0) {
- error = zfs_resume_fs(zfsvfs,
- zc->zc_name, mode);
- }
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0)
+ error = zfs_resume_fs(zfsvfs, zc->zc_name);
}
if (error == 0)
error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
VFS_RELE(zfsvfs->z_vfs);
} else {
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_USER, &os);
+ /* XXX kind of reading contents without owning */
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error)
return (error);
error = dmu_objset_userspace_upgrade(os);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
}
return (error);
@@ -3219,7 +4162,7 @@ zfs_init_sharefs()
}
return (0);
}
-#endif /* sun */
+#endif /* sun */
static int
zfs_ioc_share(zfs_cmd_t *zc)
@@ -3314,15 +4257,123 @@ zfs_ioc_share(zfs_cmd_t *zc)
zc->zc_share.z_sharemax);
return (error);
-#else /* sun */
+
+#else /* !sun */
return (ENOSYS);
-#endif /* sun */
+#endif /* !sun */
}
ace_t full_access[] = {
{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
};
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
+ os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value	short name of the newly created snapshot
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ int error;
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+
+ if (strlen(snap_name) >= MAXNAMELEN) {
+ strfree(snap_name);
+ return (E2BIG);
+ }
+
+ error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name,
+ NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd);
+ if (error != 0) {
+ strfree(snap_name);
+ return (error);
+ }
+
+ (void) strcpy(zc->zc_value, snap_name);
+ strfree(snap_name);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ objset_t *fromsnap;
+ objset_t *tosnap;
+ file_t *fp;
+ offset_t off;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
+ if (error)
+ return (error);
+
+ error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap);
+ if (error) {
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+ }
+
+ fp = getf(zc->zc_cookie);
+ if (fp == NULL) {
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (EBADF);
+ }
+
+ off = fp->f_offset;
+
+ error = dmu_diff(tosnap, fromsnap, fp, &off);
+
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+}
+
#ifdef sun
/*
* Remove all ACL files in shares dir
@@ -3368,7 +4419,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
/* Now make sure mntpnt and dataset are ZFS */
- if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
(strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
zc->zc_name) != 0)) {
VN_RELE(vp);
@@ -3377,7 +4428,6 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
dzp = VTOZ(vp);
zfsvfs = dzp->z_zfsvfs;
-
ZFS_ENTER(zfsvfs);
/*
@@ -3440,7 +4490,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
case ZFS_SMB_ACL_RENAME:
if ((error = get_nvlist(zc->zc_nvlist_src,
- zc->zc_nvlist_src_size, &nvlist)) != 0) {
+ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
ZFS_EXIT(zfsvfs);
return (error);
@@ -3451,6 +4501,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
VN_RELE(vp);
VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
+ nvlist_free(nvlist);
return (error);
}
error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
@@ -3479,6 +4530,127 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
}
/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this hold
+ * zc_cookie recursive flag
+ * zc_temphold set if hold is temporary
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ * zc_sendobj if non-zero, the objid for zc_name@zc_value
+ * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_hold(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+ minor_t minor = 0;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (zc->zc_sendobj == 0) {
+ return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive, zc->zc_temphold,
+ zc->zc_cleanup_fd));
+ }
+
+ if (recursive)
+ return (EINVAL);
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ /*
+ * Until we have a hold on this snapshot, it's possible that
+ * zc_sendobj could've been destroyed and reused as part
+ * of a later txg. Make sure we're looking at the right object.
+ */
+ if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOENT);
+ }
+
+ if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ }
+
+ error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
+ zc->zc_temphold);
+ if (minor != 0) {
+ if (error == 0) {
+ dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
+ minor);
+ }
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
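
A hedged sketch of how user space could drive this ioctl, filling the fields documented above; the wrapper function, dataset names, and error handling are hypothetical, while ZFS_IOC_HOLD and the zfs_cmd_t fields come from the ZFS ioctl headers:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/zfs_ioctl.h>

static int
example_hold(const char *fs, const char *snap, const char *tag)
{
	zfs_cmd_t zc = { 0 };
	int fd = open("/dev/zfs", O_RDWR);

	if (fd < 0)
		return (-1);
	(void) strlcpy(zc.zc_name, fs, sizeof (zc.zc_name));
	(void) strlcpy(zc.zc_value, snap, sizeof (zc.zc_value));
	(void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
	zc.zc_cookie = 0;		/* not recursive */
	zc.zc_temphold = B_FALSE;	/* permanent hold */
	zc.zc_cleanup_fd = -1;		/* no cleanup-on-exit fd */

	if (ioctl(fd, ZFS_IOC_HOLD, &zc) != 0)
		perror("ZFS_IOC_HOLD");
	(void) close(fd);
	return (0);
}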
+
+/*
+ * inputs:
+ * zc_name name of dataset from which we're releasing a user hold
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this hold
+ * zc_cookie recursive flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_release(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of snapshot holds
+ */
+static int
+zfs_ioc_get_holds(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
+/*
* pool create, destroy, and export don't log the history as part of
* zfsdev_ioctl, but rather zfs_ioc_pool_create and zfs_ioc_pool_export
* do the logging of those commands.
@@ -3514,7 +4686,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_FALSE },
{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
@@ -3534,6 +4706,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_TRUE },
{ zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
B_TRUE },
+ { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_TRUE },
{ zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_TRUE },
{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
@@ -3543,10 +4717,6 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_TRUE },
{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
- { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
- B_FALSE },
{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
B_TRUE},
@@ -3566,14 +4736,14 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
- B_TRUE },
+ { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
+ B_TRUE, B_TRUE },
{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE,
B_FALSE },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ B_TRUE },
{ zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
@@ -3582,13 +4752,9 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_TRUE },
{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
- B_FALSE },
{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
B_TRUE },
- { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
- { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
B_FALSE },
{ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
@@ -3597,15 +4763,30 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
DATASET_NAME, B_FALSE, B_FALSE },
{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
DATASET_NAME, B_FALSE, B_TRUE },
- { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_TRUE }
+ { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, B_FALSE },
+ { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME,
+ B_FALSE, B_FALSE },
+ { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
+ { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }
};
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type)
{
spa_t *spa;
- char pool[ZFS_MAXNAMELEN];
int error;
ASSERT(type == POOL_NAME || type == DATASET_NAME);
@@ -3619,27 +4800,157 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type)
return (error);
}
+/*
+ * Find a free minor number.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ minor_t minor;
+ zfs_soft_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (ENXIO);
+
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+ return (EAGAIN);
+
+ devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_CTLDEV;
+ zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
+
+ return (0);
+}
+
+static void
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ zfs_onexit_destroy(zo);
+ ddi_soft_state_free(zfsdev_state, minor);
+}
+
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+ zfs_soft_state_t *zp;
+
+ zp = ddi_get_soft_state(zfsdev_state, minor);
+ if (zp == NULL || zp->zss_type != which)
+ return (NULL);
+
+ return (zp->zss_data);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error = 0;
+
+#ifdef sun
+ if (getminor(*devp) != 0)
+ return (zvol_open(devp, flag, otyp, cr));
+#endif
+
+ /* This is the control device. Allocate a new minor if requested. */
+ if (flag & FEXCL) {
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+ }
+
+ return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+ zfs_onexit_t *zo;
+ minor_t minor = (minor_t)(uintptr_t)data;
+
+ if (minor == 0)
+ return;
+
+ mutex_enter(&zfsdev_state_lock);
+ zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (zo == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return;
+ }
+ zfs_ctldev_destroy(zo, minor);
+ mutex_exit(&zfsdev_state_lock);
+}
+
static int
zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
struct thread *td)
{
- zfs_cmd_t *zc = (void *)addr;
+ zfs_cmd_t *zc;
uint_t vec;
- int error;
+ int cflag, error, len;
+
+ cflag = ZFS_CMD_COMPAT_NONE;
+ len = IOCPARM_LEN(cmd);
/*
* Check if we have sufficient kernel memory allocated
* for the zfs_cmd_t request. Bail out if not, so we
* will not access an undefined memory region.
*/
- if (IOCPARM_LEN(cmd) < sizeof(zfs_cmd_t))
- return (EINVAL);
+ if (len < sizeof(zfs_cmd_t))
+ if (len == sizeof(zfs_cmd_v15_t)) {
+ cflag = ZFS_CMD_COMPAT_V15;
+ vec = zfs_ioctl_v15_to_v28[ZFS_IOC(cmd)];
+ } else
+ return (EINVAL);
+ else
+ vec = ZFS_IOC(cmd);
- vec = ZFS_IOC(cmd);
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ if (vec == ZFS_IOC_COMPAT_PASS)
+ return (0);
+ else if (vec == ZFS_IOC_COMPAT_FAIL)
+ return (ENOTSUP);
+ }
if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
return (EINVAL);
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+ bzero(zc, sizeof(zfs_cmd_t));
+ zfs_cmd_compat_get(zc, addr, cflag);
+ zfs_ioctl_compat_pre(zc, &vec, cflag);
+ } else {
+ zc = (void *)addr;
+ }
+
error = zfs_ioc_vec[vec].zvec_secpolicy(zc, td->td_ucred);
/*
@@ -3648,6 +4959,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
*/
if (error == 0) {
zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_iflags = flag & FKIOCTL;
switch (zfs_ioc_vec[vec].zvec_namecheck) {
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
@@ -3678,9 +4990,68 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
zfs_log_history(zc);
}
+ if (cflag != ZFS_CMD_COMPAT_NONE) {
+ zfs_ioctl_compat_post(zc, ZFS_IOC(cmd), cflag);
+ zfs_cmd_compat_put(zc, addr, cflag);
+ kmem_free(zc, sizeof(zfs_cmd_t));
+ }
+
return (error);
}
+#ifdef sun
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ zfs_dip = dip;
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (DDI_FAILURE);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ zfs_dip = NULL;
+
+ ddi_prop_remove_all(dip);
+ ddi_remove_minor_node(dip, NULL);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = zfs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ return (DDI_SUCCESS);
+ }
+
+ return (DDI_FAILURE);
+}
+#endif /* sun */
+
/*
* OK, so this is a little weird.
*
@@ -3690,8 +5061,60 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
* /dev/zfs has basically nothing to do except serve up ioctls,
* so most of the standard driver entry points are in zvol.c.
*/
+#ifdef sun
+static struct cb_ops zfs_cb_ops = {
+ zfsdev_open, /* open */
+ zfsdev_close, /* close */
+ zvol_strategy, /* strategy */
+ nodev, /* print */
+ zvol_dump, /* dump */
+ zvol_read, /* read */
+ zvol_write, /* write */
+ zfsdev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
+ CB_REV, /* version */
+ nodev, /* async read */
+ nodev, /* async write */
+};
+
+static struct dev_ops zfs_dev_ops = {
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ zfs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ zfs_attach, /* attach */
+ zfs_detach, /* detach */
+ nodev, /* reset */
+ &zfs_cb_ops, /* driver operations */
+ NULL, /* no bus operations */
+ NULL, /* power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv zfs_modldrv = {
+ &mod_driverops,
+ "ZFS storage pool",
+ &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&zfs_modlfs,
+ (void *)&zfs_modldrv,
+ NULL
+};
+#endif /* sun */
+
static struct cdevsw zfs_cdevsw = {
.d_version = D_VERSION,
+ .d_open = zfsdev_open,
.d_ioctl = zfsdev_ioctl,
.d_name = ZFS_DEV_NAME
};
@@ -3716,6 +5139,69 @@ struct proc *zfsproc;
uint_t zfs_fsyncer_key;
extern uint_t rrw_tsd_key;
+#ifdef sun
+int
+_init(void)
+{
+ int error;
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ return (error);
+ }
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, NULL);
+
+ error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+ ASSERT(error == 0);
+ mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ if (zfs_nfsshare_inited)
+ (void) ddi_modclose(nfs_mod);
+ if (zfs_smbshare_inited)
+ (void) ddi_modclose(smbsrv_mod);
+ if (zfs_nfsshare_inited || zfs_smbshare_inited)
+ (void) ddi_modclose(sharefs_mod);
+
+ tsd_destroy(&zfs_fsyncer_key);
+ ldi_ident_release(zfs_li);
+ zfs_li = NULL;
+ mutex_destroy(&zfs_share_lock);
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+#endif /* sun */
+
static int
zfs_modevent(module_t mod, int type, void *unused __unused)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
index 310508875347..29378d8e71fc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -44,14 +43,6 @@
#include <sys/zfs_fuid.h>
#include <sys/dsl_dataset.h>
-#define ZFS_HANDLE_REPLAY(zilog, tx) \
- if (zilog->zl_replay) { \
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
- zilog->zl_replaying_seq; \
- return; \
- }
-
/*
* These zfs_log_* functions must be called within a dmu tx, in one
* of 2 contexts depending on zilog->z_replay:
@@ -180,6 +171,15 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
}
static void *
@@ -241,7 +241,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
zfs_fuid_info_t *fuidp, vattr_t *vap)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
lr_acl_create_t *lracl;
size_t aclsize;
@@ -253,11 +252,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
size_t namesize = strlen(name) + 1;
size_t fuidsz = 0;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If we have FUIDs present then add in space for
* domains and ACE fuid's if any.
@@ -288,21 +285,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
- lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(zp->z_uid)) {
+ lr->lr_uid = (uint64_t)zp->z_uid;
} else {
lr->lr_uid = fuidp->z_fuid_owner;
}
- if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
- lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+ if (!IS_EPHEMERAL(zp->z_gid)) {
+ lr->lr_gid = (uint64_t)zp->z_gid;
} else {
lr->lr_gid = fuidp->z_fuid_group;
}
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- lr->lr_rdev = zp->z_phys->zp_rdev;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
/*
* Fill in xvattr info if any
@@ -341,9 +342,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
bcopy(name, end, namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -351,25 +350,23 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name)
+ znode_t *dzp, char *name, uint64_t foid)
{
itx_t *itx;
- uint64_t seq;
lr_remove_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_remove_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
+ itx->itx_oid = foid;
+
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -380,24 +377,19 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name)
{
itx_t *itx;
- uint64_t seq;
lr_link_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_link_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_link_obj = zp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -408,32 +400,28 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name, char *link)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
size_t namesize = strlen(name) + 1;
size_t linksize = strlen(link) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_uid = zp->z_uid;
+ lr->lr_gid = zp->z_gid;
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
bcopy(name, (char *)(lr + 1), namesize);
bcopy(link, (char *)(lr + 1) + namesize, linksize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -444,27 +432,22 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
itx_t *itx;
- uint64_t seq;
lr_rename_t *lr;
size_t snamesize = strlen(sname) + 1;
size_t dnamesize = strlen(dname) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id;
lr->lr_tdoid = tdzp->z_id;
bcopy(sname, (char *)(lr + 1), snamesize);
bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
- seq = zil_itx_assign(zilog, itx, tx);
- sdzp->z_last_itx = seq;
- tdzp->z_last_itx = seq;
- szp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -472,9 +455,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
ssize_t zfs_immediate_write_sz = 32768;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
- sizeof (lr_write_t))
-
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -482,37 +462,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t write_state;
boolean_t slogging;
uintptr_t fsync_cnt;
+ ssize_t immediate_write_sz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ ? 0 : zfs_immediate_write_sz;
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * In this mode, if we need to commit the write later, then the block
- * is immediately written into the file system (using dmu_sync),
- * and a pointer to the block is put into the log record.
- * When the txg commits the block is linked in.
- * This saves additionally writing the data into the log record.
- * There are a few requirements for this to occur:
- * - write is greater than zfs_immediate_write_sz
- * - not using slogs (as slogs are assumed to always be faster
- * than writing into the main pool)
- * - the write occupies only one block
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
- slogging = spa_has_slogs(zilog->zl_spa);
- if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
+ slogging = spa_has_slogs(zilog->zl_spa) &&
+ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
@@ -541,8 +501,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
+ zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
@@ -559,13 +518,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx->itx_private = zp->z_zfsvfs;
- if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
- (ioflag & (FSYNC | FDSYNC)))
- itx->itx_sync = B_TRUE;
- else
+ if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
itx->itx_sync = B_FALSE;
- zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+ zil_itx_assign(zilog, itx, tx);
off += len;
resid -= len;
@@ -580,14 +537,11 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t off, uint64_t len)
{
itx_t *itx;
- uint64_t seq;
lr_truncate_t *lr;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_truncate_t *)&itx->itx_lr;
lr->lr_foid = zp->z_id;
@@ -595,8 +549,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
lr->lr_length = len;
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -607,18 +560,14 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_setattr_t *lr;
xvattr_t *xvap = (xvattr_t *)vap;
size_t recsize = sizeof (lr_setattr_t);
void *start;
-
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If XVATTR set, then log record size needs to allow
* for lr_attr_t + xvattr mask, mapsize and create time
@@ -662,8 +611,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
(void) zfs_log_fuid_domains(fuidp, start);
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -674,7 +622,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_acl_v0_t *lrv0;
lr_acl_t *lr;
int txtype;
@@ -682,11 +629,9 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
size_t txsize;
size_t aclbytes = vsecp->vsa_aclentsz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
TX_ACL_V0 : TX_ACL;
@@ -732,6 +677,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
}
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
new file mode 100644
index 000000000000..ca0acfd3206d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
@@ -0,0 +1,252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+ zfs_onexit_t *zo;
+
+ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+ offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (*zo == NULL)
+ return (EBADF);
+
+ return (0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ file_t *fp, *tmpfp;
+ zfs_onexit_t *zo;
+ void *data;
+ int error;
+
+ fp = getf(fd);
+ if (fp == NULL)
+ return (EBADF);
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ if (error == 0)
+ *minorp = (minor_t)(uintptr_t)data;
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (error);
+ return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ releasef(fd);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
+
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+ zfs_onexit_action_node_t *match;
+ zfs_onexit_action_node_t *ap;
+ list_t *l;
+
+ ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+ match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+ l = &zo->zo_actions;
+ for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
+ if (match == ap)
+ break;
+ }
+ return (ap);
+}
+
+/*
+ * Delete the callback, triggering it first if 'fire' is set.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (fire)
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ } else {
+ mutex_exit(&zo->zo_lock);
+ error = ENOENT;
+ }
+
+ return (error);
+}
+
+/*
+ * Return the data associated with this callback. This allows consumers
+ * of the cleanup-on-exit interfaces to stash kernel data across system
+ * calls, knowing that it will be cleaned up if the calling process exits.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ *data = NULL;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL)
+ *data = ap->za_data;
+ else
+ error = ENOENT;
+ mutex_exit(&zo->zo_lock);
+
+ return (error);
+}
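
The comment block at the top of zfs_onexit.c describes a multi-ioctl flow: hold the cleanup fd, register a callback, fetch the accumulated state back by action handle, then release the fd. The following is a minimal consumer sketch of that flow, illustrative only — recv_state_t, recv_state_free() and recv_state_get() are hypothetical names, while the zfs_onexit_*() calls mirror the signatures added above.

typedef struct recv_state {
	void	*rs_tree;		/* e.g. an AVL tree of GUID mappings */
} recv_state_t;

/* Cleanup callback, run implicitly when the owning process exits. */
static void
recv_state_free(void *arg)
{
	recv_state_t *rs = arg;

	kmem_free(rs, sizeof (recv_state_t));
}

static int
recv_state_get(int cleanup_fd, uint64_t *handlep, recv_state_t **rsp)
{
	minor_t minor;
	void *data;
	int error;

	/* Validate the cleanup fd and hold its file table entry. */
	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	if (*handlep == 0) {
		/* First ioctl: allocate state and register the callback. */
		*rsp = kmem_zalloc(sizeof (recv_state_t), KM_SLEEP);
		error = zfs_onexit_add_cb(minor, recv_state_free, *rsp,
		    handlep);
	} else {
		/* Later ioctls: look the state back up by action handle. */
		error = zfs_onexit_cb_data(minor, *handlep, &data);
		if (error == 0)
			*rsp = data;
	}

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}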
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
index c96524726f13..ebea17a3a32a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -132,6 +129,12 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
}
static int
@@ -516,7 +519,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
break;
case TX_MKXATTR:
- name = (char *)(lr + 1);
error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
break;
case TX_SYMLINK:
@@ -531,10 +533,8 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
VOP_UNLOCK(ZTOV(dzp), 0);
out:
- if (error == 0 && vp != NULL) {
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
- }
+ if (error == 0 && vp != NULL)
+ VN_URELE(vp);
VN_RELE(ZTOV(dzp));
@@ -588,6 +588,7 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
}
vput(vp);
VOP_UNLOCK(ZTOV(dzp), 0);
+
fail:
VN_RELE(ZTOV(dzp));
@@ -616,6 +617,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
if (lr->lr_common.lrc_txtype & TX_CI)
vflg |= FIGNORECASE;
+
cn.cn_nameptr = name;
cn.cn_cred = kcred;
cn.cn_thread = curthread;
@@ -710,7 +712,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
znode_t *zp;
int error;
ssize_t resid;
- uint64_t orig_eof, eod;
+ uint64_t eod, offset, length;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -725,15 +727,10 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
error = 0;
return (error);
}
- orig_eof = zp->z_phys->zp_size;
- eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
-
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(zfsvfs->z_log, lr);
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
- lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
/*
* This may be a write from a dmu_sync() for a whole block,
@@ -741,12 +738,29 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
* We can't just replay what was written for this TX_WRITE as
* a future TX_WRITE2 may extend the eof and the data for that
* write needs to be there. So we write the whole block and
- * reduce the eof.
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof.
*/
- if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
- zp->z_phys->zp_size = eod;
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
+ }
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+ UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
VN_RELE(ZTOV(zp));
+ zfsvfs->z_replay_eof = 0; /* safety */
return (error);
}
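
A short worked example of the block rounding above may help; the numbers are made up for illustration, and the variable setup stands in for the lr_write_t fields named in the comments.

uint64_t blocksize = 131072;	/* BP_GET_LSIZE(&lr->lr_blkptr): 128K block */
uint64_t offset = 135168;	/* lr->lr_offset: 132K, inside the 2nd block */
uint64_t length = 4096;		/* lr->lr_length: a 4K dmu_sync() record */

if (length < blocksize) {
	offset -= offset % blocksize;	/* 135168 - 4096 = 131072 (128K) */
	length = blocksize;		/* replay the whole 128K block */
}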
@@ -767,21 +781,34 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
+top:
end = lr->lr_offset + lr->lr_length;
- if (end > zp->z_phys->zp_size) {
- ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
- zp->z_phys->zp_size = end;
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
}
VN_RELE(ZTOV(zp));
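
The dmu_tx_assign()/ERESTART handling added above is the standard retry idiom for assigning a transaction during replay. A stripped-down sketch of just that idiom follows; do_update() is a hypothetical placeholder for the real change (here, the sa_update() of the file size), and the vnode handling from the hunk above is omitted.

top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);	/* wait for the next open txg */
			dmu_tx_abort(tx);
			goto top;		/* then retry the assignment */
		}
		dmu_tx_abort(tx);		/* hard failure: give up */
	} else {
		do_update(zp, tx);		/* hypothetical: apply the change */
		dmu_tx_commit(tx);
	}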
@@ -792,9 +819,33 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
static int
zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
{
+#ifdef sun
+ znode_t *zp;
+ flock64_t fl;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&fl, sizeof (fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = 0;
+ fl.l_start = lr->lr_offset;
+ fl.l_len = lr->lr_length;
+ error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
+ lr->lr_offset, kcred, NULL);
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+#else /* !sun */
ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
return (EOPNOTSUPP);
+#endif /* !sun */
}
static int
@@ -816,16 +867,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log setattrs out of order, it's possible the
- * file has been removed. In this case just drop the setattr
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
@@ -874,16 +917,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
@@ -935,16 +970,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
}
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
#ifdef TODO
bzero(&vsa, sizeof (vsa));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
index 4de8d8a2dfed..7fd8f6020d08 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -112,7 +112,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
+ * a "sa" data or z_zfsvfs - so skip that processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
@@ -125,14 +125,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
+ new->r_off = zp->z_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ end_size = MAX(zp->z_size, new->r_off + len);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
new->r_off = 0;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
new file mode 100644
index 000000000000..d141e43d722a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * Order of attributes doesn't matter; a unique value will be
+ * assigned for each attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively. The file system
+ * could have other attributes stored in files, but they will be
+ * ignored. The SA framework will preserve them, just that
+ * this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {NULL, 0, 0, 0}
+};
+
+#ifdef _KERNEL
+
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+ error = uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY(dmu_set_bonus(db,
+ len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+ zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade,
+ * since the SA code can read both old/new znode formats
+ * with probably little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ sa_bulk_attr_t bulk[20];
+ int count = 0;
+ sa_bulk_attr_t sa_attrs[20] = { 0 };
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent;
+ uint64_t crtime[2], mtime[2], ctime[2];
+ zfs_acl_phys_t znode_acl;
+ char scanstamp[AV_SCANSTAMP_SZ];
+ boolean_t drop_lock = B_FALSE;
+
+ /*
+ * No upgrade if the ACL isn't cached,
+ * since we won't know which locks are held
+ * and reading the ACL would require special "locked"
+ * interfaces that would be messy
+ */
+ if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
+ return;
+
+ /*
+ * If the z_lock is held and we aren't the owner,
+ * then just return since we don't want to deadlock
+ * trying to update the status of z_is_sa. This
+ * file can then be upgraded at a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (mutex_owner(&zp->z_lock) != curthread) {
+ if (mutex_tryenter(&zp->z_lock) == 0)
+ return;
+ else
+ drop_lock = B_TRUE;
+ }
+
+ /* First do a bulk query of the attributes that aren't cached */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ goto done;
+
+
+ /*
+ * While the order here doesn't matter, it's best to try and organize
+ * it in such a way as to pick up an already existing layout number
+ */
+ count = 0;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
+
+ /* if scanstamp then add scanstamp */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+ return;
+
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+#endif
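
The registration table at the top of zfs_sa.c is consumed at mount time. The sketch below shows that wiring, mirroring the zfsvfs_create() hunk later in this patch; error handling is abbreviated and the surrounding function context (os, zfsvfs, error) is assumed.

uint64_t sa_obj = 0;

if (zfsvfs->z_use_sa) {
	/* SA-capable filesystems record their attribute registry here. */
	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error)
		return (error);
}

/* Build the per-mount attribute table; sa_obj stays 0 for pre-SA versions. */
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table);

/* Filesystems at or above the SA version convert znodes lazily via this. */
if (error == 0 && zfsvfs->z_version >= ZPL_VERSION_SA)
	sa_register_update_callback(os, zfs_sa_upgrade);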
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 287de4ce7573..e9a956ce4bb4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -45,6 +46,7 @@
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
@@ -55,17 +57,19 @@
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
+#include <sys/sa.h>
+#include "zfs_comutil.h"
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
-int zfs_super_owner = 0;
+int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
"File system owner can perform privileged operation on his file systems");
-int zfs_debug_level = 0;
+int zfs_debug_level;
TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
"Debug level");
@@ -74,12 +78,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
"ZFS_ACL_VERSION");
-static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
- &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
-static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
- &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
"SPA_VERSION");
@@ -156,9 +154,8 @@ zfs_sync(vfs_t *vfsp, int waitfor)
}
if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
- else
- txg_wait_synced(dp, 0);
+ zil_commit(zfsvfs->z_log, 0);
+
ZFS_EXIT(zfsvfs);
} else {
/*
@@ -172,6 +169,60 @@ zfs_sync(vfs_t *vfsp, int waitfor)
return (0);
}
+#ifndef __FreeBSD__
+static int
+zfs_create_unique_device(dev_t *dev)
+{
+ major_t new_major;
+
+ do {
+ ASSERT3U(zfs_minor, <=, MAXMIN32);
+ minor_t start = zfs_minor;
+ do {
+ mutex_enter(&zfs_dev_mtx);
+ if (zfs_minor >= MAXMIN32) {
+ /*
+ * If we're still using the real major
+ * keep out of /dev/zfs and /dev/zvol minor
+ * number space. If we're using a getudev()'ed
+ * major number, we can use all of its minors.
+ */
+ if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
+ zfs_minor = ZFS_MIN_MINOR;
+ else
+ zfs_minor = 0;
+ } else {
+ zfs_minor++;
+ }
+ *dev = makedevice(zfs_major, zfs_minor);
+ mutex_exit(&zfs_dev_mtx);
+ } while (vfs_devismounted(*dev) && zfs_minor != start);
+ if (zfs_minor == start) {
+ /*
+ * We are using all ~262,000 minor numbers for the
+ * current major number. Create a new major number.
+ */
+ if ((new_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "zfs_mount: Can't get unique major "
+ "device number.");
+ return (-1);
+ }
+ mutex_enter(&zfs_dev_mtx);
+ zfs_major = new_major;
+ zfs_minor = 0;
+
+ mutex_exit(&zfs_dev_mtx);
+ } else {
+ break;
+ }
+ /* CONSTANTCONDITION */
+ } while (1);
+
+ return (0);
+}
+#endif /* !__FreeBSD__ */
+
static void
atime_changed_cb(void *arg, uint64_t newval)
{
@@ -313,14 +364,6 @@ vscan_changed_cb(void *arg, uint64_t newval)
}
static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_mode = newval;
-}
-
-static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
@@ -335,11 +378,11 @@ zfs_register_callbacks(vfs_t *vfsp)
objset_t *os = NULL;
zfsvfs_t *zfsvfs = NULL;
uint64_t nbmand;
- int readonly, do_readonly = FALSE;
- int setuid, do_setuid = FALSE;
- int exec, do_exec = FALSE;
- int xattr, do_xattr = FALSE;
- int atime, do_atime = FALSE;
+ int readonly, do_readonly = B_FALSE;
+ int setuid, do_setuid = B_FALSE;
+ int exec, do_exec = B_FALSE;
+ int xattr, do_xattr = B_FALSE;
+ int atime, do_atime = B_FALSE;
int error = 0;
ASSERT(vfsp);
@@ -360,7 +403,8 @@ zfs_register_callbacks(vfs_t *vfsp)
* of mount options, we stash away the current values and
* restore them after we register the callbacks.
*/
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
readonly = B_TRUE;
do_readonly = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
@@ -444,8 +488,6 @@ zfs_register_callbacks(vfs_t *vfsp)
error = error ? error : dsl_prop_register(ds,
"snapdir", snapdir_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
- "aclmode", acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
"aclinherit", acl_inherit_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
"vscan", vscan_changed_cb, zfsvfs);
@@ -483,7 +525,6 @@ unregister:
(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
zfsvfs);
(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
@@ -491,62 +532,53 @@ unregister:
}
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
- int64_t delta, dmu_tx_t *tx)
-{
- uint64_t used = 0;
- char buf[32];
- int err;
- uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
- if (delta == 0)
- return;
-
- (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
- err = zap_lookup(os, obj, buf, 8, 1, &used);
- ASSERT(err == 0 || err == ENOENT);
- /* no underflow/overflow */
- ASSERT(delta > 0 || used >= -delta);
- ASSERT(delta < 0 || used + delta > used);
- used += delta;
- if (used == 0)
- err = zap_remove(os, obj, buf, tx);
- else
- err = zap_update(os, obj, buf, 8, 1, &used, tx);
- ASSERT(err == 0);
-}
-
-static void
-zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
- void *oldbonus, void *newbonus,
- uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+ uint64_t *userp, uint64_t *groupp)
{
- znode_phys_t *oldznp = oldbonus;
- znode_phys_t *newznp = newbonus;
+ znode_phys_t *znp = data;
+ int error = 0;
- if (bonustype != DMU_OT_ZNODE)
- return;
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (ENOENT);
- /* We charge 512 for the dnode (if it's allocated). */
- if (oldznp->zp_gen != 0)
- oldused += DNODE_SIZE;
- if (newznp->zp_gen != 0)
- newused += DNODE_SIZE;
+ /*
+ * If we have a NULL data pointer
+ * then assume the ids aren't changing and
+ * return EEXIST to the dmu to let it know to
+ * use the same ids
+ */
+ if (data == NULL)
+ return (EEXIST);
- if (oldznp->zp_uid == newznp->zp_uid) {
- uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+ if (bonustype == DMU_OT_ZNODE) {
+ *userp = znp->zp_uid;
+ *groupp = znp->zp_gid;
} else {
- uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
- uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
- }
+ int hdrsize;
- if (oldznp->zp_gid == newznp->zp_gid) {
- uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
- } else {
- uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
- uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+ ASSERT(bonustype == DMU_OT_SA);
+ hdrsize = sa_hdrsize(data);
+
+ if (hdrsize != 0) {
+ *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_UID_OFFSET));
+ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_GID_OFFSET));
+ } else {
+ /*
+ * This should only happen for newly created
+ * files that haven't had the znode data filled
+ * in yet.
+ */
+ *userp = 0;
+ *groupp = 0;
+ }
}
+ return (error);
}
static void
@@ -733,7 +765,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
}
boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
char buf[32];
uint64_t used, quota, usedobj, quotaobj;
@@ -756,33 +788,48 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
return (used >= quota);
}
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+ uint64_t fuid;
+ uint64_t quotaobj;
+
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ fuid = isgroup ? zp->z_gid : zp->z_uid;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
int
-zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
objset_t *os;
zfsvfs_t *zfsvfs;
uint64_t zval;
int i, error;
+ uint64_t sa_obj;
- if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
- return (error);
- if (zval)
- mode |= DS_MODE_READONLY;
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
- if (error == EROFS) {
- mode |= DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
- }
- if (error)
+ /*
+ * We claim to always be readonly so we can open snapshots;
+ * other ZPL code will prevent us from writing to snapshots.
+ */
+ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+ if (error) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
+ }
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
*/
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
@@ -792,15 +839,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
if (error) {
goto out;
- } else if (zfsvfs->z_version > ZPL_VERSION) {
- (void) printf("Mismatched versions: File system "
- "is version %llu on-disk format, which is "
- "incompatible with this software version %lld!",
- (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+ } else if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+ "on a version %lld pool\n. Pool must be upgraded to mount "
+ "this file system.", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
error = ENOTSUP;
goto out;
}
-
if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
goto out;
zfsvfs->z_norm = (int)zval;
@@ -822,6 +869,29 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error)
+ return (error);
+ } else {
+ /*
+ * Pre-SA version file systems should never touch
+ * either the attribute registration or layout objects.
+ */
+ sa_obj = 0;
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error)
+ goto out;
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
@@ -857,7 +927,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
goto out;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
@@ -867,12 +936,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- *zvp = zfsvfs;
+ *zfvp = zfsvfs;
return (0);
out:
- dmu_objset_close(os);
- *zvp = NULL;
+ dmu_objset_disown(os, zfsvfs);
+ *zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
@@ -889,15 +958,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
/*
* Set the objset user_ptr to track its zfsvfs.
*/
- mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
- if (zil_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- zfsvfs->z_log = NULL;
- }
/*
* If we are not mounting (ie: online recv), then we don't
@@ -917,37 +982,42 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
else
zfs_unlinked_drain(zfsvfs);
- if (zfsvfs->z_log) {
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest
- * doesn't use readonly mounts, where
- * zfs_unlinked_drain() isn't called.) This is because
- * ziltest causes spa_sync() to think it's committed,
- * but actually it is not, so the intent log contains
- * many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated
- * in a yet later txg. This would write a "create
- * object N" record to the intent log. Normally, this
- * would be fine because the spa_sync() would have
- * written out the fact that object N is free, before
- * we could write the "create object N" intent log
- * record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
@@ -974,7 +1044,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
zfs_fuid_destroy(zfsvfs);
mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_online_recv_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrw_destroy(&zfsvfs->z_teardown_lock);
@@ -989,13 +1058,24 @@ static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
- vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
}
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
static int
@@ -1009,7 +1089,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
ASSERT(vfsp);
ASSERT(osname);
- error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+ error = zfsvfs_create(osname, &zfsvfs);
if (error)
return (error);
zfsvfs->z_vfs = vfsp;
@@ -1026,7 +1106,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
-
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
@@ -1053,6 +1132,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
}
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
@@ -1063,10 +1143,11 @@ zfs_domount(vfs_t *vfsp, char *osname)
goto out;
xattr_changed_cb(zfsvfs, pval);
zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
- mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
} else {
error = zfsvfs_setup(zfsvfs, B_TRUE);
}
@@ -1080,7 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
zfsctl_create(zfsvfs);
out:
if (error) {
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
} else {
atomic_add_32(&zfs_active_fs_count, 1);
@@ -1121,9 +1202,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
VERIFY(dsl_prop_unregister(ds, "aclinherit",
acl_inherit_changed_cb, zfsvfs) == 0);
@@ -1132,6 +1210,302 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
}
}
+#ifdef SECLABEL
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+ uint64_t num = 0;
+
+ while (*str) {
+ if (*str < '0' || *str > '9')
+ return (EINVAL);
+
+ num = num*10 + *str++ - '0';
+ }
+
+ *objnum = num;
+ return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number'. Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
+ */
+static int
+zfs_parse_bootfs(char *bpath, char *outpath)
+{
+ char *slashp;
+ uint64_t objnum;
+ int error;
+
+ if (*bpath == 0 || *bpath == '/')
+ return (EINVAL);
+
+ (void) strcpy(outpath, bpath);
+
+ slashp = strchr(bpath, '/');
+
+ /* if no '/', just return the pool name */
+ if (slashp == NULL) {
+ return (0);
+ }
+
+ /* if not a number, just return the root dataset name */
+ if (str_to_uint64(slashp+1, &objnum)) {
+ return (0);
+ }
+
+ *slashp = '\0';
+ error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
+ *slashp = '/';
+
+ return (error);
+}
+
+/*
+ * zfs_check_global_label:
+ * Check that the hex label string is appropriate for the dataset
+ * being mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (EACCES);
+ return (rdonly ? 0 : EACCES);
+ }
+ return (EACCES);
+}
+
+/*
+ * zfs_mount_label_policy:
+ * Determine whether the mount is allowed according to MAC check.
+ * by comparing (where appropriate) label of the dataset against
+ * the label of the zone being mounted into. If the dataset has
+ * no label, create one.
+ *
+ * Returns:
+ * 0 : access allowed
+ * >0 : error code, such as EACCES
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+ int error, retv;
+ zone_t *mntzone = NULL;
+ ts_label_t *mnt_tsl;
+ bslabel_t *mnt_sl;
+ bslabel_t ds_sl;
+ char ds_hexsl[MAXNAMELEN];
+
+ retv = EACCES; /* assume the worst */
+
+ /*
+ * Start by getting the dataset label if it exists.
+ */
+ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (EACCES);
+
+ /*
+ * If labeling is NOT enabled, then disallow the mount of datasets
+ * which have a non-default label already. No other label checks
+ * are needed.
+ */
+ if (!is_system_labeled()) {
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ return (EACCES);
+ }
+
+ /*
+ * Get the label of the mountpoint. If mounting into the global
+ * zone (i.e. mountpoint is not within an active zone and the
+ * zoned property is off), the label must be default or
+ * admin_low/admin_high only; no other checks are needed.
+ */
+ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
+ if (mntzone->zone_id == GLOBAL_ZONEID) {
+ uint64_t zoned;
+
+ zone_rele(mntzone);
+
+ if (dsl_prop_get_integer(osname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (EACCES);
+ if (!zoned)
+ return (zfs_check_global_label(osname, ds_hexsl));
+ else
+ /*
+ * This is the case of a zone dataset being mounted
+ * initially, before the zone has been fully created;
+ * allow this mount into global zone.
+ */
+ return (0);
+ }
+
+ mnt_tsl = mntzone->zone_slabel;
+ ASSERT(mnt_tsl != NULL);
+ label_hold(mnt_tsl);
+ mnt_sl = label2bslabel(mnt_tsl);
+
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+ /*
+ * The dataset doesn't have a real label, so fabricate one.
+ */
+ char *str = NULL;
+
+ if (l_to_str_internal(mnt_sl, &str) == 0 &&
+ dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
+ retv = 0;
+ if (str != NULL)
+ kmem_free(str, strlen(str) + 1);
+ } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+ /*
+ * Now compare labels to complete the MAC check. If the
+ * labels are equal then allow access. If the mountpoint
+ * label dominates the dataset label, allow readonly access.
+ * Otherwise, access is denied.
+ */
+ if (blequal(mnt_sl, &ds_sl))
+ retv = 0;
+ else if (bldominates(mnt_sl, &ds_sl)) {
+ vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+ retv = 0;
+ }
+ }
+
+ label_rele(mnt_tsl);
+ zone_rele(mntzone);
+ return (retv);
+}
+#endif /* SECLABEL */
+
+#ifdef OPENSOLARIS_MOUNTROOT
+static int
+zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
+{
+ int error = 0;
+ static int zfsrootdone = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ znode_t *zp = NULL;
+ vnode_t *vp = NULL;
+ char *zfs_bootfs;
+ char *zfs_devid;
+
+ ASSERT(vfsp);
+
+ /*
+ * The filesystem that we mount as root is defined in the
+ * boot property "zfs-bootfs" with a format of
+ * "poolname/root-dataset-objnum".
+ */
+ if (why == ROOT_INIT) {
+ if (zfsrootdone++)
+ return (EBUSY);
+ /*
+ * the process of doing a spa_load will require the
+ * clock to be set before we could (for example) do
+ * something better by looking at the timestamp on
+ * an uberblock, so just set it to -1.
+ */
+ clkset(-1);
+
+ if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
+ cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
+ "bootfs name");
+ return (EINVAL);
+ }
+ zfs_devid = spa_get_bootprop("diskdevid");
+ error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
+ if (zfs_devid)
+ spa_free_bootprop(zfs_devid);
+ if (error) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
+ error);
+ return (error);
+ }
+ if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
+ error);
+ return (error);
+ }
+
+ spa_free_bootprop(zfs_bootfs);
+
+ if (error = vfs_lock(vfsp))
+ return (error);
+
+ if (error = zfs_domount(vfsp, rootfs.bo_name)) {
+ cmn_err(CE_NOTE, "zfs_domount: error %d", error);
+ goto out;
+ }
+
+ zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
+ cmn_err(CE_NOTE, "zfs_zget: error %d", error);
+ goto out;
+ }
+
+ vp = ZTOV(zp);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VROOT;
+ mutex_exit(&vp->v_lock);
+ rootvp = vp;
+
+ /*
+ * Leave rootvp held. The root file system is never unmounted.
+ */
+
+ vfs_add((struct vnode *)0, vfsp,
+ (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
+out:
+ vfs_unlock(vfsp);
+ return (error);
+ } else if (why == ROOT_REMOUNT) {
+ readonly_changed_cb(vfsp->vfs_data, B_FALSE);
+ vfsp->vfs_flag |= VFS_REMOUNT;
+
+ /* refresh mount options */
+ zfs_unregister_callbacks(vfsp->vfs_data);
+ return (zfs_register_callbacks(vfsp));
+
+ } else if (why == ROOT_UNMOUNT) {
+ zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
+ (void) zfs_sync(vfsp, 0, 0);
+ return (0);
+ }
+
+ /*
+ * if "why" is equal to anything else other than ROOT_INIT,
+ * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
+ */
+ return (ENOTSUP);
+}
+#endif /* OPENSOLARIS_MOUNTROOT */
+
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
@@ -1203,6 +1577,12 @@ zfs_mount(vfs_t *vfsp)
goto out;
}
+#ifdef SECLABEL
+ error = zfs_mount_label_policy(vfsp, osname);
+ if (error)
+ goto out;
+#endif
+
vfsp->vfs_flag |= MNT_NFS4ACLS;
/*
@@ -1291,6 +1671,25 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp)
return (0);
}
+int
+zfs_vnode_lock(vnode_t *vp, int flags)
+{
+ int error;
+
+ ASSERT(vp != NULL);
+
+ /*
+ * Check if the file system wasn't forcibly unmounted in the meantime.
+ */
+ error = vn_lock(vp, flags);
+ if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
+ VOP_UNLOCK(vp, 0);
+ error = ENOENT;
+ }
+
+ return (error);
+}
+
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
@@ -1301,14 +1700,18 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
ZFS_ENTER_NOERROR(zfsvfs);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
ZFS_EXIT(zfsvfs);
if (error == 0) {
- *vpp = ZTOV(rootzp);
- error = vn_lock(*vpp, flags);
- (*vpp)->v_vflag |= VV_ROOT;
+ error = zfs_vnode_lock(*vpp, flags);
+ if (error == 0)
+ (*vpp)->v_vflag |= VV_ROOT;
}
+ if (error != 0)
+ *vpp = NULL;
return (error);
}
@@ -1371,7 +1774,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp))
- if (zp->z_dbuf) {
+ if (zp->z_sa_hdl) {
ASSERT(ZTOV(zp)->v_count >= 0);
zfs_znode_dmu_fini(zp);
}
@@ -1416,10 +1819,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
/*
* Evict cached data
*/
- if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
- }
+ if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
@@ -1440,6 +1843,7 @@ zfs_umount(vfs_t *vfsp, int fflag)
ZFS_DELEG_PERM_MOUNT, cr))
return (ret);
}
+
/*
* We purge the parent filesystem's vfsp as the parent filesystem
* and all of its snapshots have their vnode's v_vfsp set to the
@@ -1525,14 +1929,14 @@ zfs_umount(vfs_t *vfsp, int fflag)
/*
* Unset the objset user_ptr.
*/
- mutex_enter(&os->os->os_user_ptr_lock);
+ mutex_enter(&os->os_user_ptr_lock);
dmu_objset_set_user(os, NULL);
- mutex_exit(&os->os->os_user_ptr_lock);
+ mutex_exit(&os->os_user_ptr_lock);
/*
* Finally release the objset
*/
- dmu_objset_close(os);
+ dmu_objset_disown(os, zfsvfs);
}
/*
@@ -1572,13 +1976,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
VN_RELE(ZTOV(zp));
err = EINVAL;
}
+ if (err == 0)
+ *vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
+ if (err == 0)
+ err = zfs_vnode_lock(*vpp, flags);
if (err != 0)
*vpp = NULL;
- else {
- *vpp = ZTOV(zp);
- vn_lock(*vpp, flags);
- }
return (err);
}
@@ -1611,7 +2015,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
uint64_t fid_gen = 0;
uint64_t gen_mask;
uint64_t zp_gen;
- int i, err;
+ int i, err;
*vpp = NULL;
@@ -1665,8 +2069,10 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
VN_HOLD(*vpp);
}
ZFS_EXIT(zfsvfs);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (0);
+ err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
}
gen_mask = -1ULL >> (64 - 8 * i);
@@ -1676,7 +2082,9 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
ZFS_EXIT(zfsvfs);
return (err);
}
- zp_gen = zp->z_phys->zp_gen & gen_mask;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
if (zp_gen == 0)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
@@ -1686,12 +2094,14 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
return (EINVAL);
}
- ZFS_EXIT(zfsvfs);
-
*vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
- return (0);
+ ZFS_EXIT(zfsvfs);
+ err = zfs_vnode_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
}
/*
@@ -1701,17 +2111,13 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
* 'z_teardown_inactive_lock' write held.
*/
int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
-
- *modep = zfsvfs->z_os->os_mode;
- if (name)
- dmu_objset_name(zfsvfs->z_os, name);
- dmu_objset_close(zfsvfs->z_os);
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
return (0);
}
@@ -1720,21 +2126,49 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
* Reopen zfsvfs_t::z_os and release VOPs.
*/
int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
int err;
ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
- err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+ err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
+ &zfsvfs->z_os);
if (err) {
zfsvfs->z_os = NULL;
} else {
znode_t *zp;
+ uint64_t sa_obj = 0;
+
+ /*
+ * Make sure version hasn't changed
+ */
+
+ err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
+ &zfsvfs->z_version);
+
+ if (err)
+ goto bail;
+
+ err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+ if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
+ goto bail;
+
+ if ((err = sa_setup(zfsvfs->z_os, sa_obj,
+ zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0)
+ goto bail;
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(zfsvfs->z_os,
+ zfs_sa_upgrade);
VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+ zfs_set_fuid_feature(zfsvfs);
+
/*
* Attempt to re-establish all the active znodes with
* their dbufs. If a zfs_rezget() fails, then we'll let
@@ -1747,17 +2181,17 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
-
}
+bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
if (err) {
/*
- * Since we couldn't reopen zfsvfs::z_os, force
- * unmount this file system.
+ * Since we couldn't reopen zfsvfs::z_os, or
+ * set up the SA framework, force unmount this file system.
*/
if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
@@ -1773,9 +2207,11 @@ zfs_freevfs(vfs_t *vfsp)
#ifdef sun
/*
* If this is a snapshot, we have an extra VFS_HOLD on our parent
- * from zfs_mount(). Release it here.
+ * from zfs_mount(). Release it here. If we came through
+ * zfs_mountroot() instead, we didn't grab an extra hold, so
+ * skip the VFS_RELE for rootvfs.
*/
- if (zfsvfs->z_issnap)
+ if (zfsvfs->z_issnap && (vfsp != rootvfs))
VFS_RELE(zfsvfs->z_parent->z_vfs);
#endif /* sun */
@@ -1825,17 +2261,17 @@ zfs_init(void)
printf("ZFS filesystem version " ZPL_VERSION_STRING "\n");
/*
- * Initialize znode cache, vnode ops, etc...
+ * Initialize .zfs directory structures
*/
- zfs_znode_init();
+ zfsctl_init();
/*
- * Initialize .zfs directory structures
+ * Initialize znode cache, vnode ops, etc...
*/
- zfsctl_init();
+ zfs_znode_init();
/*
- * Reduce number of vnode. Originally number of vnodes is calculated
+ * Reduce number of vnodes. Originally number of vnodes is calculated
* with UFS inode in mind. We reduce it here, because it's too big for
* ZFS/i386.
*/
@@ -1871,13 +2307,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
if (newvers < zfsvfs->z_version)
return (EINVAL);
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (ENOTSUP);
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
+
error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &newvers, tx);
@@ -1886,20 +2332,35 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (error);
}
- spa_history_internal_log(LOG_DS_UPGRADE,
- dmu_objset_spa(os), tx, CRED(),
- "oldver=%llu newver=%llu dataset = %llu",
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT3U(error, ==, 0);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal(LOG_DS_UPGRADE,
+ dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
zfsvfs->z_version, newvers, dmu_objset_id(os));
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
- if (zfsvfs->z_version >= ZPL_VERSION_FUID)
- zfs_set_fuid_feature(zfsvfs);
+ zfs_set_fuid_feature(zfsvfs);
return (0);
}
+
/*
* Read a property stored within the master node.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index f2fdb7a38324..795a7bd216b4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -23,6 +23,7 @@
*/
/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
@@ -47,10 +48,12 @@
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
@@ -58,6 +61,7 @@
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
@@ -122,7 +126,7 @@
* (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors.
*
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
* to ensure that synchronous semantics are provided when necessary.
*
* In general, this is how things should be ordered in each vnode op:
@@ -154,7 +158,7 @@
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
+ * zil_commit(zilog, foid); // synchronous when necessary
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
@@ -169,7 +173,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & FAPPEND) == 0)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
@@ -177,8 +181,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0) {
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
if (fs_vscan(*vpp, cr, 0) != 0) {
ZFS_EXIT(zfsvfs);
return (EACCES);
@@ -216,8 +219,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0)
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
VERIFY(fs_vscan(vp, cr, 1) == 0);
ZFS_EXIT(zfsvfs);
@@ -237,7 +239,7 @@ zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
int error;
boolean_t hole;
- file_sz = zp->z_phys->zp_size;
+ file_sz = zp->z_size;
if (noff >= file_sz) {
return (ENXIO);
}
@@ -370,7 +372,6 @@ zfs_unmap_page(struct sf_buf *sf)
sf_buf_free(sf);
}
-
/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
@@ -378,7 +379,6 @@ zfs_unmap_page(struct sf_buf *sf)
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
*/
-
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
int segflg, dmu_tx_t *tx)
@@ -420,6 +420,71 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
}
/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain VPO_BUSY on all of them,
+ * map them into a contiguous KVA region and populate them
+ * in a single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(uio->uio_segflg == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+ VM_OBJECT_LOCK(obj);
+ for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
+ if (pp->valid == 0) {
+ vm_page_io_start(pp);
+ VM_OBJECT_UNLOCK(obj);
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ VM_OBJECT_LOCK(obj);
+ vm_page_io_finish(pp);
+ vm_page_lock(pp);
+ if (error) {
+ vm_page_free(pp);
+ } else {
+ pp->valid = VM_PAGE_BITS_ALL;
+ vm_page_activate(pp);
+ }
+ vm_page_unlock(pp);
+ }
+ if (error)
+ break;
+ uio->uio_resid -= bytes;
+ uio->uio_offset += bytes;
+ len -= bytes;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
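
The UIO_NOCOPY path handled by mappedread_sf() above is driven by sendfile(2). A userland sketch of the call that ends up exercising it (file name and socket are placeholders; not part of the patch):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <err.h>

    /* Send an entire file over an already-connected socket. */
    static void
    send_whole_file(int sock, const char *path)
    {
        off_t sbytes = 0;
        int fd = open(path, O_RDONLY);

        if (fd == -1)
            err(1, "open(%s)", path);
        /* nbytes == 0 means "until end of file" on FreeBSD. */
        if (sendfile(fd, sock, 0, 0, NULL, &sbytes, 0) == -1)
            err(1, "sendfile");
        (void) close(fd);
    }
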
+
+/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
@@ -435,14 +500,11 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
znode_t *zp = VTOZ(vp);
objset_t *os = zp->z_zfsvfs->z_os;
vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
int64_t start;
caddr_t va;
int len = nbytes;
int off;
int error = 0;
- uint64_t dirbytes;
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
@@ -450,98 +512,25 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
start = uio->uio_loffset;
off = start & PAGEOFFSET;
- dirbytes = 0;
VM_OBJECT_LOCK(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- int bytes = MIN(PAGESIZE - off, len);
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, off, bytes)) {
- if ((m->oflags & VPO_BUSY) != 0) {
- /*
- * Reference the page before unlocking and
- * sleeping so that the page daemon is less
- * likely to reclaim it.
- */
- vm_page_lock_queues();
- vm_page_flag_set(m, PG_REFERENCED);
- vm_page_sleep(m, "zfsmrb");
- goto again;
- }
+ if (pp = page_lookup(vp, start, off, bytes)) {
+ struct sf_buf *sf;
+ caddr_t va;
- vm_page_busy(m);
VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0)
- uiomove_fromphys(&m, off, bytes, uio);
+ va = zfs_map_page(pp, &sf);
+ error = uiomove(va + off, bytes, UIO_READ, uio);
+ zfs_unmap_page(sf);
VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else if (uio->uio_segflg == UIO_NOCOPY) {
- /*
- * The code below is here to make sendfile(2) work
- * correctly with ZFS. As pointed out by ups@
- * sendfile(2) should be changed to use VOP_GETPAGES(),
- * but it pessimize performance of sendfile/UFS, that's
- * why I handle this special case in ZFS code.
- */
- KASSERT(off == 0,
- ("unexpected offset in mappedread for sendfile"));
- if (m != NULL && (m->oflags & VPO_BUSY) != 0) {
- /*
- * Reference the page before unlocking and
- * sleeping so that the page daemon is less
- * likely to reclaim it.
- */
- vm_page_lock_queues();
- vm_page_flag_set(m, PG_REFERENCED);
- vm_page_sleep(m, "zfsmrb");
- goto again;
- } else if (m == NULL) {
- m = vm_page_alloc(obj, OFF_TO_IDX(start),
- VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL);
- if (m == NULL) {
- VM_OBJECT_UNLOCK(obj);
- VM_WAIT;
- VM_OBJECT_LOCK(obj);
- goto again;
- }
- }
- vm_page_io_start(m);
+ page_unlock(pp);
+ } else {
VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- va = zfs_map_page(m, &sf);
- error = dmu_read(os, zp->z_id, start, bytes, va,
- DMU_READ_PREFETCH);
- if (bytes != PAGE_SIZE)
- bzero(va + bytes, PAGE_SIZE - bytes);
- zfs_unmap_page(sf);
- }
+ error = dmu_read_uio(os, zp->z_id, uio, bytes);
VM_OBJECT_LOCK(obj);
- vm_page_io_finish(m);
- vm_page_lock(m);
- if (error == 0) {
- m->valid = VM_PAGE_BITS_ALL;
- vm_page_activate(m);
- } else
- vm_page_free(m);
- vm_page_unlock(m);
-
- if (error == 0) {
- uio->uio_resid -= bytes;
- uio->uio_offset += bytes;
- }
- } else {
- dirbytes += bytes;
}
len -= bytes;
off = 0;
@@ -549,8 +538,6 @@ again:
break;
}
VM_OBJECT_UNLOCK(obj);
- if (error == 0 && dirbytes > 0)
- error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
return (error);
}
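
The point of mappedread()/update_pages() is coherence between the DMU cache and the page cache. A userland sketch of the property being preserved (path is a placeholder and error handling is trimmed; not part of the patch):

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
        const char *path = "/tmp/zfs-coherence-demo";
        char *map;
        int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);

        (void) ftruncate(fd, 4096);
        map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        /* Write through the file descriptor... */
        (void) pwrite(fd, "via write(2)", 12, 0);
        /* ...and an existing MAP_SHARED mapping must observe it. */
        printf("mapping sees: %.12s\n", map);

        /* The reverse direction must hold as well. */
        memcpy(map, "via mmap(2) ", 12);
        (void) msync(map, 4096, MS_SYNC);

        (void) munmap(map, 4096);
        (void) close(fd);
        return (0);
    }
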
@@ -584,12 +571,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ssize_t n, nbytes;
int error;
rl_t *rl;
+ xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
os = zfsvfs->z_os;
- if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (EACCES);
}
@@ -613,7 +601,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* Check for mandatory locks
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+ if (MANDMODE(zp->z_mode)) {
if (error = chklock(vp, FREAD,
uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
ZFS_EXIT(zfsvfs);
@@ -624,8 +612,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* If we're in FRSYNC mode, sync out this znode before reading it.
*/
- if (ioflag & FRSYNC)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
/*
* Lock the range against changes.
@@ -636,18 +624,54 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* If we are reading past end-of-file we can skip
* to the end; but we might still need to set atime.
*/
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
+ if (uio->uio_loffset >= zp->z_size) {
error = 0;
goto out;
}
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+ ASSERT(uio->uio_loffset < zp->z_size);
+ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+#ifdef sun
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ (void) dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(vp)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ (void) dmu_xuio_add(xuio,
+ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz), 0, blksz);
+ }
+ }
+ }
+#endif /* sun */
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+#ifdef __FreeBSD__
+ if (uio->uio_segflg == UIO_NOCOPY)
+ error = mappedread_sf(vp, nbytes, uio);
+ else
+#endif /* __FreeBSD__ */
if (vn_has_cached_data(vp))
error = mappedread(vp, nbytes, uio);
else
@@ -661,7 +685,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n -= nbytes;
}
-
out:
zfs_range_unlock(rl);
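
The zero-copy block count and the chunked read loop above both rely on the power-of-two alignment macros. A standalone sketch of that arithmetic (macro definitions mirror sys/sysmacros.h; the offsets and sizes are made-up example values, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    #define P2ALIGN(x, a)   ((x) & -(a))
    #define P2PHASE(x, a)   ((x) & ((a) - 1))
    #define P2ROUNDUP(x, a) (-(-(x) & -(a)))

    int
    main(void)
    {
        uint64_t blksz = 128 * 1024;        /* example z_blksz */
        uint64_t offset = 300 * 1024;       /* example uio_loffset */
        uint64_t n = 200 * 1024;            /* example request length */

        /* Blocks a zero-copy read at [offset, offset+n) touches. */
        uint64_t nblk = (P2ROUNDUP(offset + n, blksz) -
            P2ALIGN(offset, blksz)) / blksz;

        /* Bytes left in the current read chunk (1 MB default assumed). */
        uint64_t chunk = 1024 * 1024;
        uint64_t nbytes = chunk - P2PHASE(offset, chunk);

        printf("nblk = %ju, nbytes = %ju\n",
            (uintmax_t)nblk, (uintmax_t)nbytes);
        return (0);
    }
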
@@ -671,53 +694,6 @@ out:
}
/*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified.
- * Any error will exit this routine as this is only a best
- * attempt to get the pages resident. This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
- struct iovec *iov;
- ulong_t cnt, incr;
- caddr_t p;
-
- if (uio->uio_segflg != UIO_USERSPACE)
- return;
-
- iov = uio->uio_iov;
-
- while (n) {
- cnt = MIN(iov->iov_len, n);
- if (cnt == 0) {
- /* empty iov entry */
- iov++;
- continue;
- }
- n -= cnt;
- /*
- * touch each page in this segment.
- */
- p = iov->iov_base;
- while (cnt) {
- if (fubyte(p) == -1)
- return;
- incr = MIN(cnt, PAGESIZE);
- p += incr;
- cnt -= incr;
- }
- /*
- * touch the last byte in case it straddles a page.
- */
- p--;
- if (fubyte(p) == -1)
- return;
- iov++;
- }
-}
-
-/*
* Write the bytes to a file.
*
* IN: vp - vnode of file to be written to.
@@ -735,6 +711,7 @@ zfs_prefault_write(ssize_t n, struct uio *uio)
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
*/
+
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -751,9 +728,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ssize_t n, nbytes;
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
- uint64_t pflags;
int error;
arc_buf_t *abuf;
+ iovec_t *aiov;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
+ int count = 0;
+ sa_bulk_attr_t bulk[4];
+ uint64_t mtime[2], ctime[2];
/*
* Fasttrack empty write
@@ -768,13 +753,19 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
/*
* If immutable or not appending then return EPERM
*/
- pflags = zp->z_phys->zp_flags;
- if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_phys->zp_size))) {
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -782,44 +773,61 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
zilog = zfsvfs->z_log;
/*
+ * Validate file offset
+ */
+ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Check for mandatory locks before calling zfs_range_lock()
+ * in order to prevent a deadlock with locks set via fcntl().
+ */
+ if (MANDMODE((mode_t)zp->z_mode) &&
+ (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef sun
+ /*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
*/
- zfs_prefault_write(n, uio);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif /* sun */
/*
* If in append mode, set the io offset pointer to eof.
*/
if (ioflag & FAPPEND) {
/*
- * Range lock for a file append:
- * The value for the start of range will be determined by
- * zfs_range_lock() (to guarantee append semantics).
- * If this write will cause the block size to increase,
- * zfs_range_lock() will lock the entire file, so we must
- * later reduce the range after we grow the block size.
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
*/
rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+ woff = rl->r_off;
if (rl->r_len == UINT64_MAX) {
- /* overlocked, zp_size can't change */
- woff = uio->uio_loffset = zp->z_phys->zp_size;
- } else {
- woff = uio->uio_loffset = rl->r_off;
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
}
+ uio->uio_loffset = woff;
} else {
- woff = uio->uio_loffset;
/*
- * Validate file offset
- */
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If we need to grow the block size then zfs_range_lock()
- * will lock a wider range than we request here.
- * Later after growing the block size we reduce the range.
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
*/
rl = zfs_range_lock(zp, woff, n, RL_WRITER);
}
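
The RL_APPEND range lock obtained above is what makes O_APPEND writes land at the then-current end of file even with concurrent writers. A userland sketch of the semantics being guaranteed (path is a placeholder; not part of the patch):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Append one line; the kernel picks the offset (current EOF) atomically. */
    static void
    append_line(const char *path, const char *line)
    {
        int fd = open(path, O_WRONLY | O_CREAT | O_APPEND, 0644);

        if (fd == -1)
            return;
        (void) write(fd, line, strlen(line));
        (void) close(fd);
    }
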
@@ -833,16 +841,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- end_size = MAX(zp->z_phys->zp_size, woff + n);
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_size);
+
+ end_size = MAX(zp->z_size, woff + n);
/*
* Write the file in reasonable size chunks. Each chunk is written
@@ -852,31 +854,41 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
-
again:
- if (zfs_usergroup_overquota(zfsvfs,
- B_FALSE, zp->z_phys->zp_uid) ||
- zfs_usergroup_overquota(zfsvfs,
- B_TRUE, zp->z_phys->zp_gid)) {
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
error = EDQUOT;
break;
}
- /*
- * If dmu_assign_arcbuf() is expected to execute with minimum
- * overhead loan an arc buffer and copy user data to it before
- * we enter a txg. This avoids holding a txg forever while we
- * pagefault on a hanging NFS server mapping.
- */
- if (abuf == NULL && n >= max_blksz &&
- woff >= zp->z_phys->zp_size &&
+ if (xuio && abuf == NULL) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (abuf == NULL && n >= max_blksz &&
+ woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
size_t cbytes;
- abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if (error = uiocopy(abuf->b_data, max_blksz,
@@ -891,8 +903,9 @@ again:
* Start a transaction.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -931,22 +944,38 @@ again:
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
- if (woff + nbytes > zp->z_phys->zp_size)
+ if (woff + nbytes > zp->z_size)
vnode_pager_setsize(vp, woff + nbytes);
if (abuf == NULL) {
tx_bytes = uio->uio_resid;
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
- nbytes, tx);
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
- ASSERT(tx_bytes == max_blksz);
- dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+ woff, abuf, tx);
+ }
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
-
if (tx_bytes && vn_has_cached_data(vp)) {
update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
zp->z_id, uio->uio_segflg, tx);
@@ -957,6 +986,8 @@ again:
* partial progress, update the znode and ZIL accordingly.
*/
if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
dmu_tx_commit(tx);
ASSERT(error != 0);
break;
@@ -974,29 +1005,41 @@ again:
* user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
(S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(vp, cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
}
mutex_exit(&zp->z_acl_lock);
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
/*
* Update the file size (zp_size) if it has changed;
* account for possible concurrent updates.
*/
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
+ ASSERT(error == 0);
+ }
+ /*
+ * If we are replaying and eof is non zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
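
The atomic_cas_64() loop above only ever moves z_size forward, retrying if a racing writer updated it first. A standalone sketch of the same compare-and-swap retry pattern using C11 atomics (names are illustrative; not part of the patch):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t file_size;  /* plays the role of zp->z_size */

    static void
    grow_size_to(uint64_t new_eof)
    {
        uint64_t cur = atomic_load(&file_size);

        /* Only ever move the size forward; retry if another writer won. */
        while (cur < new_eof &&
            !atomic_compare_exchange_weak(&file_size, &cur, new_eof))
            ;
    }
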
@@ -1004,6 +1047,11 @@ again:
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
+
+#ifdef sun
+ if (!xuio && n > 0)
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif /* sun */
}
zfs_range_unlock(rl);
@@ -1017,31 +1065,36 @@ again:
return (error);
}
- if (ioflag & (FSYNC | FDSYNC))
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
+ if (ioflag & (FSYNC | FDSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
}
void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
+zfs_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
- vnode_t *vp = ZTOV(rl->r_zp);
- objset_t *os = rl->r_zp->z_zfsvfs->z_os;
+ znode_t *zp = zgd->zgd_private;
+ objset_t *os = zp->z_zfsvfs->z_os;
int vfslocked;
- vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_range_unlock(zgd->zgd_rl);
+
+ vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
- VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
kmem_free(zgd, sizeof (zgd_t));
VFS_UNLOCK_GIANT(vfslocked);
}
@@ -1059,20 +1112,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
- uint64_t off = lr->lr_offset;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
int error = 0;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
/*
* Nothing to do if the file has been removed
*/
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
return (ENOENT);
if (zp->z_unlinked) {
/*
@@ -1084,6 +1138,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
return (ENOENT);
}
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_private = zp;
+
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
@@ -1092,17 +1150,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (offset >= zp->z_size) {
error = ENOENT;
- goto out;
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
}
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
- DMU_READ_NO_PREFETCH));
+ ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@@ -1110,80 +1167,59 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- if (zp->z_blksz == dlen)
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+ RL_READER);
+ if (zp->z_blksz == size)
break;
- zfs_range_unlock(rl);
+ offset += blkoff;
+ zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (lr->lr_offset >= zp->z_size)
error = ENOENT;
- goto out;
- }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
#ifdef DEBUG
if (zil_fault_io) {
error = EIO;
zil_fault_io = 0;
- } else {
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
}
-#else
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
#endif
- if (error != 0) {
- kmem_free(zgd, sizeof (zgd_t));
- goto out;
- }
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT((error && error != EINPROGRESS) ||
- lr->lr_length <= zp->z_blksz);
if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= zp->z_blksz);
+
/*
- * dmu_sync() can compress a block of zeros to a null
- * blkptr but the block size still needs to be passed
- * through to replay.
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
*/
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
- }
+ if (error == 0)
+ return (0);
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS) {
- return (0);
- } else if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- error = 0;
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ error = 0;
+ }
}
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
}
-out:
- zfs_range_unlock(rl);
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ zfs_get_done(zgd, error);
+
return (error);
}
@@ -1267,7 +1303,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
if (dvp->v_type != VDIR) {
return (ENOTDIR);
- } else if (zdp->z_dbuf == NULL) {
+ } else if (zdp->z_sa_hdl == NULL) {
return (EIO);
}
@@ -1321,7 +1357,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
* We don't allow recursive attributes..
* Maybe someday we will.
*/
- if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+ if (zdp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1394,7 +1430,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
VOP_UNLOCK(dvp, 0);
}
ZFS_EXIT(zfsvfs);
- error = vn_lock(*vpp, cnp->cn_lkflags);
+ error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
if (cnp->cn_flags & ISDOTDOT)
vn_lock(dvp, ltype | LK_RETRY);
if (error != 0) {
@@ -1466,8 +1502,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
void *vsecp = NULL;
int flag = 0;
@@ -1481,9 +1518,10 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
uid = ksid_getid(ksid);
else
uid = crgetuid(cr);
+
if (zfsvfs->z_use_fuids == B_FALSE &&
(vsecp || (vap->va_mask & AT_XVATTR) ||
- IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (EINVAL);
ZFS_ENTER(zfsvfs);
@@ -1528,12 +1566,15 @@ top:
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
NULL, NULL);
if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
if (strcmp(name, "..") == 0)
error = EISDIR;
ZFS_EXIT(zfsvfs);
return (error);
}
}
+
if (zp == NULL) {
uint64_t txtype;
@@ -1542,6 +1583,8 @@ top:
* to reference it.
*/
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
goto out;
}
@@ -1549,16 +1592,20 @@ top:
* We only support the creation of regular files in
* extended attribute directories.
*/
- if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
(vap->va_type != VREG)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
error = EINVAL;
goto out;
}
-
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
- &acl_ids)) != 0)
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
goto out;
+ have_acl = B_TRUE;
+
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
error = EDQUOT;
@@ -1566,36 +1613,39 @@ top:
}
tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ 0, acl_ids.z_aclp->z_acl_bytes);
}
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
(void) zfs_link_create(dl, zp, tx, ZNEW);
-
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
if (flag & FIGNORECASE)
txtype |= TX_CI;
@@ -1606,6 +1656,10 @@ top:
} else {
int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
/*
* A directory entry already exists for this name.
*/
@@ -1660,6 +1714,9 @@ out:
error = specvp_check(vpp, cr);
}
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -1680,17 +1737,22 @@ out:
* dvp - ctime|mtime
* vp - ctime (if nlink > 0)
*/
+
+uint64_t null_xattr = 0;
+
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
int flags)
{
znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
+ znode_t *xzp;
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
+ uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
boolean_t may_delete_now, delete_now = FALSE;
@@ -1712,6 +1774,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
}
top:
+ xattr_obj = 0;
+ xzp = NULL;
/*
* Attempt to lock directory; fail if entry doesn't exist.
*/
@@ -1744,7 +1808,9 @@ top:
else
dnlc_remove(dvp, name);
- may_delete_now = FALSE;
+ VI_LOCK(vp);
+ may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
+ VI_UNLOCK(vp);
/*
* We may delete the znode now, or we may put it in the unlinked set;
@@ -1752,27 +1818,34 @@ top:
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
+ obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
if (may_delete_now) {
toobig =
- zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+ zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
/* if the file is too big, only hold_free a token amount */
dmu_tx_hold_free(tx, zp->z_id, 0,
(toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
}
/* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- /* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -1781,6 +1854,8 @@ top:
if (error) {
zfs_dirent_unlock(dl);
VN_RELE(vp);
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
@@ -1803,29 +1878,45 @@ top:
goto out;
}
- if (0 && unlinked) {
+ if (unlinked) {
+
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
VI_LOCK(vp);
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
delete_now = may_delete_now && !toobig &&
vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+ xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
+ acl_obj;
VI_UNLOCK(vp);
}
if (delete_now) {
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ if (xattr_obj_unlinked) {
+ ASSERT3U(xzp->z_links, ==, 2);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
+ xzp->z_links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx);
+ ASSERT3U(error, ==, 0);
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
}
- mutex_enter(&zp->z_lock);
VI_LOCK(vp);
vp->v_count--;
ASSERT3U(vp->v_count, ==, 0);
@@ -1833,13 +1924,14 @@ top:
mutex_exit(&zp->z_lock);
zfs_znode_delete(zp, tx);
} else if (unlinked) {
+ mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
}
txtype = TX_REMOVE;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
@@ -1848,12 +1940,13 @@ out:
zfs_dirent_unlock(dl);
- if (!delete_now) {
+ if (!delete_now)
VN_RELE(vp);
- } else if (xzp) {
- /* this rele is delayed to prevent nesting transactions */
+ if (xzp)
VN_RELE(ZTOV(xzp));
- }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1895,7 +1988,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
ASSERT(vap->va_type == VDIR);
@@ -1911,15 +2004,15 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
- IS_EPHEMERAL(crgetgid(cr))))
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (EINVAL);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ if (dzp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1932,37 +2025,43 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
if (flags & FIGNORECASE)
zf |= ZCILOOK;
- if (vap->va_mask & AT_XVATTR)
+ if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
crgetuid(cr), cr, vap->va_type)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
+ }
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ vsecp, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
/*
* First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
*/
top:
*vpp = NULL;
if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
NULL, NULL)) {
+ zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
- &acl_ids)) != 0) {
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
@@ -1979,18 +2078,23 @@ top:
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1999,10 +2103,11 @@ top:
/*
* Create new node.
*/
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
+
/*
* Now put new name in parent dir.
*/
@@ -2017,10 +2122,14 @@ top:
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
+
dmu_tx_commit(tx);
zfs_dirent_unlock(dl);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (0);
}
@@ -2108,8 +2217,10 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
rw_exit(&zp->z_parent_lock);
@@ -2136,7 +2247,7 @@ top:
uint64_t txtype = TX_RMDIR;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
@@ -2151,6 +2262,9 @@ out:
VN_RELE(vp);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -2197,6 +2311,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
zap_attribute_t zap;
uint_t bytes_wanted;
uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
int local_eof;
int outcount;
int error;
@@ -2210,6 +2325,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
/*
* If we are not given an eof variable,
* use a local one.
@@ -2273,8 +2394,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
* Minimum entry size is dirent size and 1 byte for a file name.
*/
ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
- cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
- *cookies = cooks;
+ *cookies = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
*ncookies = ncooks;
}
/*
@@ -2298,7 +2418,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
while (outcount < bytes_wanted) {
ino64_t objnum;
ushort_t reclen;
- off64_t *next;
+ off64_t *next = NULL;
/*
* Special case `.', `..', and `.zfs'.
@@ -2311,7 +2431,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
} else if (offset == 1) {
(void) strcpy(zap.za_name, "..");
zap.za_normalization_conflict = 0;
- objnum = zp->z_phys->zp_parent;
+ objnum = parent;
type = DT_DIR;
} else if (offset == 2 && zfs_show_ctldir(zp)) {
(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
@@ -2421,6 +2541,16 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
if (prefetch)
dmu_prefetch(os, objnum, 0, 0);
+ if (ncookies != NULL) {
+ if (cooks == NULL)
+ cooks = *cookies;
+ else {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+
skip_entry:
/*
* Move to the next entry, fill in the previous offset.
@@ -2431,12 +2561,6 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
} else {
offset += 1;
}
-
- if (cooks != NULL) {
- *cooks++ = offset;
- ncooks--;
- KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
- }
}
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
@@ -2485,10 +2609,12 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
- ZFS_EXIT(zfsvfs);
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
return (0);
}
@@ -2515,26 +2641,38 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp;
int error = 0;
uint32_t blksize;
u_longlong_t nblocks;
uint64_t links;
+ uint64_t mtime[2], ctime[2], crtime[2];
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
/*
* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
* Also, if we are the owner don't bother, since owner should
* always be allowed to read basic attributes of file.
*/
- if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
- (pzp->zp_uid != crgetuid(cr))) {
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
ZFS_EXIT(zfsvfs);
@@ -2548,19 +2686,18 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
*/
mutex_enter(&zp->z_lock);
- vap->va_type = IFTOVT(pzp->zp_mode);
- vap->va_mode = pzp->zp_mode & ~S_IFMT;
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
// vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
vap->va_nodeid = zp->z_id;
if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
- links = pzp->zp_links + 1;
+ links = zp->z_links + 1;
else
- links = pzp->zp_links;
+ links = zp->z_links;
vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
+ vap->va_size = zp->z_size;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
+// vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
vap->va_seq = zp->z_seq;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
@@ -2571,110 +2708,114 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
xoap->xoa_archive =
- ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
xoap->xoa_readonly =
- ((pzp->zp_flags & ZFS_READONLY) != 0);
+ ((zp->z_pflags & ZFS_READONLY) != 0);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
xoap->xoa_system =
- ((pzp->zp_flags & ZFS_SYSTEM) != 0);
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
xoap->xoa_hidden =
- ((pzp->zp_flags & ZFS_HIDDEN) != 0);
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
xoap->xoa_nounlink =
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
xoap->xoa_immutable =
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
xoap->xoa_appendonly =
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
xoap->xoa_nodump =
- ((pzp->zp_flags & ZFS_NODUMP) != 0);
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
xoap->xoa_opaque =
- ((pzp->zp_flags & ZFS_OPAQUE) != 0);
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
xoap->xoa_av_quarantined =
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
xoap->xoa_av_modified =
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- vp->v_type == VREG &&
- (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
- /*
- * Only VREG files have anti-virus scanstamps, so we
- * won't conflict with symlinks in the bonus buffer.
- */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len <= doi.doi_bonus_size) {
- /*
- * pzp points to the start of the
- * znode_phys_t. pzp + 1 points to the
- * first byte after the znode_phys_t.
- */
- (void) memcpy(xoap->xoa_av_scanstamp,
- pzp + 1,
- sizeof (xoap->xoa_av_scanstamp));
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
}
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
+ uint64_t times[2];
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ times, sizeof (times));
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
- ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
mutex_exit(&zp->z_lock);
- dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
vap->va_bytes = nblocks << 9; /* nblocks * 512 */
@@ -2713,7 +2854,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
- znode_phys_t *pzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
@@ -2725,15 +2865,19 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
int trim_mask = 0;
uint64_t new_mode;
uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
znode_t *attrzp;
int need_policy = FALSE;
- int err;
+ int err, err2;
zfs_fuid_info_t *fuidp = NULL;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap;
- zfs_acl_t *aclp = NULL;
+ zfs_acl_t *aclp;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- boolean_t fuid_dirtied = B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
if (mask == 0)
return (0);
@@ -2744,7 +2888,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
zilog = zfsvfs->z_log;
/*
@@ -2781,14 +2924,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
/*
* Immutable files can only alter immutable bit and atime
*/
- if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
- if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+ if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -2809,6 +2952,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
top:
attrzp = NULL;
+ aclp = NULL;
/* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
@@ -2844,10 +2988,13 @@ top:
((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
skipaclchk, cr);
+ }
if (mask & (AT_UID|AT_GID)) {
int idmask = (mask & (AT_UID|AT_GID));
@@ -2860,7 +3007,7 @@ top:
*/
if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
+ vap->va_mode = zp->z_mode;
/*
* Take ownership or chgrp to group we are a member of
@@ -2898,7 +3045,7 @@ top:
}
mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode;
+ oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
/*
@@ -2910,7 +3057,7 @@ top:
*/
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
if (xoap->xoa_appendonly !=
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
@@ -2920,7 +3067,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
if (xoap->xoa_nounlink !=
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
@@ -2930,7 +3077,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
if (xoap->xoa_immutable !=
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
@@ -2940,7 +3087,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
if (xoap->xoa_nodump !=
- ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NODUMP);
@@ -2950,7 +3097,7 @@ top:
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
if (xoap->xoa_av_modified !=
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
@@ -2962,7 +3109,7 @@ top:
if ((vp->v_type != VREG &&
xoap->xoa_av_quarantined) ||
xoap->xoa_av_quarantined !=
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
@@ -2970,6 +3117,12 @@ top:
}
}
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
if (need_policy == FALSE &&
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
@@ -3038,79 +3191,89 @@ top:
*/
mask = vap->va_mask;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
- goto out;
- if (pzp->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL from old V0 format to new V1 */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- pzp->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- }
+ if ((mask & (AT_UID | AT_GID))) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
- if (mask & (AT_UID | AT_GID)) {
- if (pzp->zp_xattr) {
- err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
if (err)
- goto out;
- dmu_tx_hold_bonus(tx, attrzp->z_id);
+ goto out2;
}
if (mask & AT_UID) {
new_uid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (new_uid != pzp->zp_uid &&
- zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (new_uid != zp->z_uid &&
+ zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
- goto out;
+ goto out2;
}
}
if (mask & AT_GID) {
new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
cr, ZFS_GROUP, &fuidp);
- if (new_gid != pzp->zp_gid &&
- zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (new_gid != zp->z_gid &&
+ zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
- goto out;
+ goto out2;
}
}
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
- FALSE, NULL);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ zfs_acl_chmod_setattr(zp, &aclp, new_mode);
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
} else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
}
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
}
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if ((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
}
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err) {
if (err == ERESTART)
@@ -3118,8 +3281,7 @@ top:
goto out;
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
+ count = 0;
/*
* Set each attribute requested.
* We group settings according to the locks they need to acquire.
@@ -3128,47 +3290,108 @@ top:
* updated as a side-effect of calling this function.
*/
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
mutex_enter(&zp->z_lock);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
if (mask & AT_MODE) {
- mutex_enter(&zp->z_acl_lock);
- zp->z_phys->zp_mode = new_mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+ ASSERT3U((uintptr_t)aclp, !=, 0);
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = aclp;
aclp = NULL;
- mutex_exit(&zp->z_acl_lock);
}
- if (attrzp)
- mutex_enter(&attrzp->z_lock);
- if (mask & AT_UID) {
- pzp->zp_uid = new_uid;
- if (attrzp)
- attrzp->z_phys->zp_uid = new_uid;
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
}
- if (mask & AT_GID) {
- pzp->zp_gid = new_gid;
- if (attrzp)
- attrzp->z_phys->zp_gid = new_gid;
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
}
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime, B_TRUE);
+ }
+ }
/*
* Do this after setting timestamps to prevent timestamp
* update from toggling bit
@@ -3200,20 +3423,10 @@ top:
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
}
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
ASSERT(vp->v_type == VREG);
- /* Grow the bonus buffer if necessary. */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len > doi.doi_bonus_size)
- VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
- }
- zfs_xvattr_set(zp, xvap);
+ zfs_xvattr_set(zp, xvap, tx);
}
if (fuid_dirtied)
@@ -3223,11 +3436,23 @@ top:
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
mutex_exit(&zp->z_lock);
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
if (attrzp)
VN_RELE(ZTOV(attrzp));
-
if (aclp)
zfs_acl_free(aclp);
@@ -3236,13 +3461,18 @@ out:
fuidp = NULL;
}
- if (err)
+ if (err) {
dmu_tx_abort(tx);
- else
+ if (err == ERESTART)
+ goto top;
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
+ }
- if (err == ERESTART)
- goto top;
+out2:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (err);
@@ -3283,7 +3513,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zfs_zlock_t *zl;
znode_t *zp = tdzp;
uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
+ uint64_t oidp = zp->z_id;
krwlock_t *rwlp = &szp->z_parent_lock;
krw_t rw = RW_WRITER;
@@ -3305,7 +3535,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zfs_rename_unlock(&zl);
*zlpp = NULL;
zp = tdzp;
- oidp = &zp->z_id;
+ oidp = zp->z_id;
rwlp = &szp->z_parent_lock;
rw = RW_WRITER;
continue;
@@ -3323,19 +3553,20 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
zl->zl_next = *zlpp;
*zlpp = zl;
- if (*oidp == szp->z_id) /* We're a descendant of szp */
+ if (oidp == szp->z_id) /* We're a descendant of szp */
return (EINVAL);
- if (*oidp == rootid) /* We've hit the top */
+ if (oidp == rootid) /* We've hit the top */
return (0);
if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
if (error)
return (error);
zl->zl_znode = zp;
}
- oidp = &zp->z_phys->zp_parent;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
+ &oidp, sizeof (oidp));
rwlp = &zp->z_parent_lock;
rw = RW_READER;
@@ -3415,8 +3646,7 @@ top:
 * by renaming a linked file into/out of an attribute directory.
* See the comment in zfs_link() for why this is considered bad.
*/
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -3517,6 +3747,11 @@ top:
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
+ /*
+ * FreeBSD: In OpenSolaris they only check if rename source is
+ * ".." here, because "." is handled in their lookup. This is
+ * not the case for FreeBSD, so we check for "." explicitly.
+ */
if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
serr = EINVAL;
ZFS_EXIT(zfsvfs);
@@ -3596,14 +3831,20 @@ top:
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
@@ -3634,17 +3875,39 @@ top:
if (error == 0) {
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
if (error == 0) {
- szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+ szp->z_pflags |= ZFS_AV_MODIFIED;
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
- zfs_log_rename(zilog, tx,
- TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
- sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
- /* Update path information for the target vnode */
- vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, ZTOV(szp), tnm,
+ strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
}
#ifdef FREEBSD_NAMECACHE
if (error == 0) {
@@ -3665,10 +3928,14 @@ out:
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
+
VN_RELE(ZTOV(szp));
if (tzp)
VN_RELE(ZTOV(tzp));
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
@@ -3701,11 +3968,12 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- int len = strlen(link);
+ uint64_t len = strlen(link);
int error;
int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
int flags = 0;
ASSERT(vap->va_type == VLNK);
@@ -3721,27 +3989,35 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
}
if (flags & FIGNORECASE)
zflg |= ZCILOOK;
-top:
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
return (ENAMETOOLONG);
}
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
- VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
@@ -3751,71 +4027,59 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
/*
* Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
+ * For version 4 ZPL datasets the symlink will be an SA attribute
*/
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
- zp->z_id, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
/*
* Insert the new object into the directory.
*/
(void) zfs_link_create(dl, zp, tx, ZNEW);
- if (error == 0) {
- uint64_t txtype = TX_SYMLINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- *vpp = ZTOV(zp);
- }
+
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *vpp = ZTOV(zp);
zfs_acl_ids_free(&acl_ids);
@@ -3823,6 +4087,9 @@ top:
zfs_dirent_unlock(dl);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3850,29 +4117,21 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- size_t bufsz;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- bufsz = (size_t)zp->z_phys->zp_size;
- if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
- error = uiomove(zp->z_phys + 1,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3938,7 +4197,12 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
/* Prevent links to .zfs/shares files */
- if (szp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -3957,16 +4221,14 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
* into "normal" file space in order to circumvent restrictions
* imposed in attribute space.
*/
- if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
- (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
- owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
- if (owner != crgetuid(cr) &&
- secpolicy_basic_link(svp, cr) != 0) {
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -3987,8 +4249,10 @@ top:
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
@@ -4019,9 +4283,250 @@ top:
vnevent_link(svp, ct);
}
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef sun
+/*
+ * zfs_null_putapage() is used when the file system has been force
+ * unmounted. It just drops the pages.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
+ return (0);
+}
+
+/*
+ * Push a page out to disk, klustering if possible.
+ *
+ * IN: vp - file to push page to.
+ * pp - page to push.
+ * flags - additional flags.
+ * cr - credentials of caller.
+ *
+ * OUT: offp - start of range pushed.
+ * lenp - len of range pushed.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * NOTE: callers must have locked the page to be pushed. On
+ * exit, the page (and all other pages in the kluster) must be
+ * unlocked.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ u_offset_t off, koff;
+ size_t len, klen;
+ int err;
+
+ off = pp->p_offset;
+ len = PAGESIZE;
+ /*
+ * If our blocksize is bigger than the page size, try to kluster
+ * multiple pages so that we write a full block (thus avoiding
+ * a read-modify-write).
+ */
+ if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
+ klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+ koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
+ ASSERT(koff <= zp->z_size);
+ if (koff + klen > zp->z_size)
+ klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
+ pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
+ }
+ ASSERT3U(btop(len), ==, btopr(len));
+
+ /*
+ * Can't push pages past end-of-file.
+ */
+ if (off >= zp->z_size) {
+ /* ignore all pages */
+ err = 0;
+ goto out;
+ } else if (off + len > zp->z_size) {
+ int npages = btopr(zp->z_size - off);
+ page_t *trunc;
+
+ page_list_break(&pp, &trunc, npages);
+ /* ignore pages past end of file */
+ if (trunc)
+ pvn_write_done(trunc, flags);
+ len = zp->z_size - off;
+ }
+
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ err = EDQUOT;
+ goto out;
+ }
+top:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz <= PAGESIZE) {
+ caddr_t va = zfs_map_page(pp, S_READ);
+ ASSERT3U(len, <=, PAGESIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+ zfs_unmap_page(pp, va);
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
+ if (offp)
+ *offp = off;
+ if (lenp)
+ *lenp = len;
+
+ return (err);
+}
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the file's vnode.
+ *
+ * IN: vp - vnode of file to push page data to.
+ * off - position in file to put data.
+ * len - amount of data to write.
+ * flags - flags to control the operation.
+ * cr - credentials of caller.
+ * ct - caller context.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ page_t *pp;
+ size_t io_len;
+ u_offset_t io_off;
+ uint_t blksz;
+ rl_t *rl;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * Align this request to the file block size in case we kluster.
+ * XXX - this can result in pretty aggressive locking, which can
+ * impact simultaneous read/write access. One option might be
+ * to break up long requests (len == 0) into block-by-block
+ * operations to get narrower locking.
+ */
+ blksz = zp->z_blksz;
+ if (ISP2(blksz))
+ io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+ else
+ io_off = 0;
+ if (len > 0 && ISP2(blksz))
+ io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+ else
+ io_len = 0;
+
+ if (io_len == 0) {
+ /*
+ * Search the entire vp list for pages >= io_off.
+ */
+ rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+ error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
+ goto out;
+ }
+ rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
+
+ if (off > zp->z_size) {
+ /* past end of file */
+ zfs_range_unlock(rl);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
+
+ for (off = io_off; io_off < off + len; io_off += io_len) {
+ if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+ pp = page_lookup(vp, io_off,
+ (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
+ } else {
+ pp = page_lookup_nowait(vp, io_off,
+ (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+ }
+
+ if (pp != NULL && pvn_getdirty(pp, flags)) {
+ int err;
+
+ /*
+ * Found a dirty page to push
+ */
+ err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
+ if (err)
+ error = err;
+ } else {
+ io_len = PAGESIZE;
+ }
+ }
+out:
+ zfs_range_unlock(rl);
+ if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
return (error);
}
+#endif /* sun */
/*ARGSUSED*/
void
@@ -4032,13 +4537,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
int error;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf == NULL) {
+ if (zp->z_sa_hdl == NULL) {
/*
* The fs has been unmounted, or we did a
* suspend/resume and this file no longer exists.
*/
VI_LOCK(vp);
- vp->v_count = 0; /* count arrives as 1 */
+ ASSERT(vp->v_count <= 1);
+ vp->v_count = 0;
VI_UNLOCK(vp);
vrecycle(vp, curthread);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
@@ -4048,13 +4554,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
@@ -4065,6 +4573,431 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
+#ifdef sun
+/*
+ * Bounds-check the seek operation.
+ *
+ * IN: vp - vnode seeking within
+ * ooff - old file offset
+ * noffp - pointer to new file offset
+ * ct - caller context
+ *
+ * RETURN: 0 if success
+ * EINVAL if new offset invalid
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ if (vp->v_type == VDIR)
+ return (0);
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+ flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * We are following the UFS semantics with respect to mapcnt
+ * here: If we see that the file is mapped already, then we will
+ * return an error, but we don't worry about races between this
+ * function and zfs_map().
+ */
+ if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering) to fill up the supplied page
+ * list. Note that the pages to be filled are held with an exclusive
+ * lock to prevent access by other threads while they are being filled.
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+ caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+ znode_t *zp = VTOZ(vp);
+ page_t *pp, *cur_pp;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ u_offset_t io_off, total;
+ size_t io_len;
+ int err;
+
+ if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+ /*
+ * We only have a single page, don't bother klustering
+ */
+ io_off = off;
+ io_len = PAGESIZE;
+ pp = page_create_va(vp, io_off, io_len,
+ PG_EXCL | PG_WAIT, seg, addr);
+ } else {
+ /*
+ * Try to find enough pages to fill the page list
+ */
+ pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+ &io_len, off, plsz, 0);
+ }
+ if (pp == NULL) {
+ /*
+ * The page already exists, nothing to do here.
+ */
+ *pl = NULL;
+ return (0);
+ }
+
+ /*
+ * Fill the pages in the kluster.
+ */
+ cur_pp = pp;
+ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
+ ASSERT3U(io_off, ==, cur_pp->p_offset);
+ va = zfs_map_page(cur_pp, S_WRITE);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
+ zfs_unmap_page(cur_pp, va);
+ if (err) {
+ /* On error, toss the entire kluster */
+ pvn_read_done(pp, B_ERROR);
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
+ return (err);
+ }
+ cur_pp = cur_pp->p_next;
+ }
+
+ /*
+ * Fill in the page list array from the kluster starting
+ * from the desired offset `off'.
+ * NOTE: the page list will always be null terminated.
+ */
+ pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+ ASSERT(pl == NULL || (*pl)->p_offset == off);
+
+ return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array. If plsz is greater than len, this function may
+ * also return page pointers from after the specified region
+ * (i.e. the region [off, off + plsz]). These additional pages are
+ * only returned if they are already in the cache, or were created as
+ * part of a klustered read.
+ *
+ * IN: vp - vnode of file to get data from.
+ * off - position in file to get data from.
+ * len - amount of data to retrieve.
+ * plsz - length of provided page list.
+ * seg - segment to obtain pages for.
+ * addr - virtual address of fault.
+ * rw - mode of created pages.
+ * cr - credentials of caller.
+ * ct - caller context.
+ *
+ * OUT: protp - protection mode of created pages.
+ * pl - list of pages created.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+ enum seg_rw rw, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ page_t **pl0 = pl;
+ int err = 0;
+
+ /* we do our own caching, faultahead is unnecessary */
+ if (pl == NULL)
+ return (0);
+ else if (len > plsz)
+ len = plsz;
+ else
+ len = P2ROUNDUP(len, PAGESIZE);
+ ASSERT(plsz >= len);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (protp)
+ *protp = PROT_ALL;
+
+ /*
+ * Loop through the requested range [off, off + len) looking
+ * for pages. If we don't find a page, we will need to create
+ * a new page and fill it with data from the file.
+ */
+ while (len > 0) {
+ if (*pl = page_lookup(vp, off, SE_SHARED))
+ *(pl+1) = NULL;
+ else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
+ goto out;
+ while (*pl) {
+ ASSERT3U((*pl)->p_offset, ==, off);
+ off += PAGESIZE;
+ addr += PAGESIZE;
+ if (len > 0) {
+ ASSERT3U(len, >=, PAGESIZE);
+ len -= PAGESIZE;
+ }
+ ASSERT3U(plsz, >=, PAGESIZE);
+ plsz -= PAGESIZE;
+ pl++;
+ }
+ }
+
+ /*
+ * Fill out the page array with any pages already in the cache.
+ */
+ while (plsz > 0 &&
+ (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
+ off += PAGESIZE;
+ plsz -= PAGESIZE;
+ }
+out:
+ if (err) {
+ /*
+ * Release any pages we have previously locked.
+ */
+ while (pl > pl0)
+ page_unlock(*--pl);
+ } else {
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ }
+
+ *pl = NULL;
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Request a memory map for a section of a file. This code interacts
+ * with common code and the VM system as follows:
+ *
+ * common code calls mmap(), which ends up in smmap_common()
+ *
+ * this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ * zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ * segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ * zfs_addmap() updates z_mapcnt
+ */
+/*ARGSUSED*/
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ segvn_crargs_t vn_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((prot & PROT_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if ((prot & (PROT_READ | PROT_EXEC)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+ ZFS_EXIT(zfsvfs);
+ return (EACCES);
+ }
+
+ if (vp->v_flag & VNOMAP) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOSYS);
+ }
+
+ if (off < 0 || len > MAXOFFSET_T - off) {
+ ZFS_EXIT(zfsvfs);
+ return (ENXIO);
+ }
+
+ if (vp->v_type != VREG) {
+ ZFS_EXIT(zfsvfs);
+ return (ENODEV);
+ }
+
+ /*
+ * If file is locked, disallow mapping.
+ */
+ if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
+ }
+
+ as_rangelock(as);
+ error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (error != 0) {
+ as_rangeunlock(as);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vn_a.vp = vp;
+ vn_a.offset = (u_offset_t)off;
+ vn_a.type = flags & MAP_TYPE;
+ vn_a.prot = prot;
+ vn_a.maxprot = maxprot;
+ vn_a.cred = cr;
+ vn_a.amp = NULL;
+ vn_a.flags = flags & ~MAP_TYPE;
+ vn_a.szc = 0;
+ vn_a.lgrp_mem_policy_flags = 0;
+
+ error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+ as_rangeunlock(as);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ uint64_t pages = btopr(len);
+
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
+ return (0);
+}
+
+/*
+ * The reason we push dirty pages as part of zfs_delmap() is so that we get a
+ * more accurate mtime for the associated file. Since we don't have a way of
+ * detecting when the data was actually modified, we have to resort to
+ * heuristics. If an explicit msync() is done, then we mark the mtime when the
+ * last page is pushed. The problem occurs when the msync() call is omitted,
+ * which is by far the most common case:
+ *
+ * open()
+ * mmap()
+ * <modify memory>
+ * munmap()
+ * close()
+ * <time lapse>
+ * putpage() via fsflush
+ *
+ * If we wait until fsflush to come along, we can have a modification time that
+ * is some arbitrary point in the future. In order to prevent this in the
+ * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
+ * torn down.
+ */
+/* ARGSUSED */
+static int
+zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ uint64_t pages = btopr(len);
+
+ ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
+
+ if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
+ vn_has_cached_data(vp))
+ (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
+
+ return (0);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: vp - vnode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller [UNUSED].
+ * ct - caller context.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+/* ARGSUSED */
+static int
+zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ if (error = convoff(vp, bfp, 0, offset)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* sun */
+
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
@@ -4075,13 +5008,21 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint32_t gen;
+ uint64_t gen64;
uint64_t object = zp->z_id;
zfid_short_t *zfid;
- int size, i;
+ int size, i, error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- gen = (uint32_t)zp->z_gen;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
fidp->fid_len = size;
@@ -4134,8 +5075,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
case _PC_FILESIZEBITS:
*valp = 64;
return (0);
-
-#if 0
+#ifdef sun
case _PC_XATTR_EXISTS:
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
@@ -4158,8 +5098,31 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
}
ZFS_EXIT(zfsvfs);
return (error);
-#endif
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_type == VREG || vp->v_type == VDIR);
+ return (0);
+
+ case _PC_ACCESS_FILTERING:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
+ vp->v_type == VDIR;
+ return (0);
+
+ case _PC_ACL_ENABLED:
+ *valp = _ACL_ACE_ENABLED;
+ return (0);
+#endif /* sun */
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+#ifdef sun
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ return (0);
+#endif /* sun */
case _PC_ACL_EXTENDED:
*valp = 0;
return (0);
@@ -4172,10 +5135,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
*valp = ACL_MAX_ENTRIES;
return (0);
- case _PC_MIN_HOLE_SIZE:
- *valp = (int)SPA_MINBLOCKSIZE;
- return (0);
-
default:
return (EOPNOTSUPP);
}
@@ -4208,14 +5167,350 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
ZFS_EXIT(zfsvfs);
return (error);
}
+#ifdef sun
+/*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ * an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int max_blksz = zfsvfs->z_max_blksz;
+ uio_t *uio = &xuio->xu_uio;
+ ssize_t size = uio->uio_resid;
+ offset_t offset = uio->uio_loffset;
+ int blksz;
+ int fullblk, i;
+ arc_buf_t *abuf;
+ ssize_t maxsize;
+ int preamble, postamble;
+
+ if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ switch (ioflag) {
+ case UIO_WRITE:
+ /*
+ * Loan out an arc_buf for write if write size is bigger than
+ * max_blksz, and the file's block size is also max_blksz.
+ */
+ blksz = max_blksz;
+ if (size < blksz || zp->z_blksz != blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /*
+ * Caller requests buffers for write before knowing where the
+ * write offset might be (e.g. NFS TCP write).
+ */
+ if (offset == -1) {
+ preamble = 0;
+ } else {
+ preamble = P2PHASE(offset, blksz);
+ if (preamble) {
+ preamble = blksz - preamble;
+ size -= preamble;
+ }
+ }
+
+ postamble = P2PHASE(size, blksz);
+ size -= postamble;
+
+ fullblk = size / blksz;
+ (void) dmu_xuio_init(xuio,
+ (preamble != 0) + fullblk + (postamble != 0));
+ DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+ int, postamble, int,
+ (preamble != 0) + fullblk + (postamble != 0));
+
+ /*
+ * Have to fix iov base/len for partial buffers. They
+ * currently represent full arc_buf's.
+ */
+ if (preamble) {
+ /* data begins in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf,
+ blksz - preamble, preamble);
+ }
+
+ for (i = 0; i < fullblk; i++) {
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, blksz);
+ }
+
+ if (postamble) {
+ /* data ends in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, postamble);
+ }
+ break;
+ case UIO_READ:
+ /*
+ * Loan out an arc_buf for read if the read size is larger than
+ * the current file block size. Block alignment is not
+ * considered. Partial arc_buf will be loaned out for read.
+ */
+ blksz = zp->z_blksz;
+ if (blksz < zcr_blksz_min)
+ blksz = zcr_blksz_min;
+ if (blksz > zcr_blksz_max)
+ blksz = zcr_blksz_max;
+ /* avoid potential complexity of dealing with it */
+ if (blksz > max_blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ maxsize = zp->z_size - uio->uio_loffset;
+ if (size > maxsize)
+ size = maxsize;
+
+ if (size < blksz || vn_has_cached_data(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ break;
+ default:
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ uio->uio_extflg = UIO_XUIO;
+ XUIO_XUZC_RW(xuio) = ioflag;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+ int i;
+ arc_buf_t *abuf;
+ int ioflag = XUIO_XUZC_RW(xuio);
+
+ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+ i = dmu_xuio_cnt(xuio);
+ while (i-- > 0) {
+ abuf = dmu_xuio_arcbuf(xuio, i);
+ /*
+ * if abuf == NULL, it must be a write buffer
+ * that has been returned in zfs_write().
+ */
+ if (abuf)
+ dmu_return_arcbuf(abuf);
+ ASSERT(abuf || ioflag == UIO_WRITE);
+ }
+
+ dmu_xuio_fini(xuio);
+ return (0);
+}
+
+/*
+ * Predeclare these here so that the compiler assumes that
+ * this is an "old style" function declaration that does
+ * not include arguments => we won't get type mismatch errors
+ * in the initializations that follow.
+ */
+static int zfs_inval();
+static int zfs_isdir();
+
+static int
+zfs_inval()
+{
+ return (EINVAL);
+}
+
+static int
+zfs_isdir()
+{
+ return (EISDIR);
+}
+/*
+ * Directory vnode operations template
+ */
+vnodeops_t *zfs_dvnodeops;
+const fs_operation_def_t zfs_dvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_READ, { .error = zfs_isdir },
+ VOPNAME_WRITE, { .error = zfs_isdir },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_CREATE, { .vop_create = zfs_create },
+ VOPNAME_REMOVE, { .vop_remove = zfs_remove },
+ VOPNAME_LINK, { .vop_link = zfs_link },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
+ VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
+ VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Regular file vnode operations template
+ */
+vnodeops_t *zfs_fvnodeops;
+const fs_operation_def_t zfs_fvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_READ, { .vop_read = zfs_read },
+ VOPNAME_WRITE, { .vop_write = zfs_write },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
+ VOPNAME_SPACE, { .vop_space = zfs_space },
+ VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
+ VOPNAME_MAP, { .vop_map = zfs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
+ VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
+ NULL, NULL
+};
+
+/*
+ * Symbolic link vnode operations template
+ */
+vnodeops_t *zfs_symvnodeops;
+const fs_operation_def_t zfs_symvnodeops_template[] = {
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * special share hidden files vnode operations template
+ */
+vnodeops_t *zfs_sharevnodeops;
+const fs_operation_def_t zfs_sharevnodeops_template[] = {
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Extended attribute directory vnode operations template
+ * This template is identical to the directory vnode operations
+ * template except for restricted operations:
+ * VOP_MKDIR()
+ * VOP_SYMLINK()
+ * Note that there are other restrictions embedded in:
+ * zfs_create() - restrict type to VREG
+ * zfs_link() - no links into/out of attribute space
+ * zfs_rename() - no moves into/out of attribute space
+ */
+vnodeops_t *zfs_xdvnodeops;
+const fs_operation_def_t zfs_xdvnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = zfs_open },
+ VOPNAME_CLOSE, { .vop_close = zfs_close },
+ VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
+ VOPNAME_CREATE, { .vop_create = zfs_create },
+ VOPNAME_REMOVE, { .vop_remove = zfs_remove },
+ VOPNAME_LINK, { .vop_link = zfs_link },
+ VOPNAME_RENAME, { .vop_rename = zfs_rename },
+ VOPNAME_MKDIR, { .error = zfs_inval },
+ VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
+ VOPNAME_SYMLINK, { .error = zfs_inval },
+ VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_SEEK, { .vop_seek = zfs_seek },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
+ * Error vnode operations template
+ */
+vnodeops_t *zfs_evnodeops;
+const fs_operation_def_t zfs_evnodeops_template[] = {
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ NULL, NULL
+};
+#endif /* sun */
+
static int
ioflags(int ioflags)
{
@@ -4286,14 +5581,12 @@ zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex);
VM_OBJECT_UNLOCK(object);
-
va = zfs_map_page(mreq, &sf);
error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex),
size, va, DMU_READ_PREFETCH);
if (size != PAGE_SIZE)
bzero(va + size, PAGE_SIZE - size);
zfs_unmap_page(sf);
-
VM_OBJECT_LOCK(object);
if (!error)
@@ -4336,7 +5629,7 @@ zfs_freebsd_open(ap)
error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
if (error == 0)
- vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
return (error);
}
@@ -4411,7 +5704,6 @@ zfs_freebsd_access(ap)
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
- znode_phys_t *zphys = zp->z_phys;
accmode_t accmode;
int error = 0;
@@ -4428,9 +5720,8 @@ zfs_freebsd_access(ap)
if (error == 0) {
accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0) {
- error = vaccess(vp->v_type, zphys->zp_mode,
- zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred,
- NULL);
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
}
}
@@ -4439,8 +5730,9 @@ zfs_freebsd_access(ap)
* non-directories.
*/
if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
- (zphys->zp_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
error = EACCES;
+ }
return (error);
}
@@ -4569,7 +5861,6 @@ zfs_freebsd_getattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
vattr_t *vap = ap->a_vap;
@@ -4616,7 +5907,6 @@ zfs_freebsd_setattr(ap)
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
- struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
@@ -4632,7 +5922,7 @@ zfs_freebsd_setattr(ap)
xva_init(&xvap);
xvap.xva_vattr = *vap;
- zflags = VTOZ(vp)->z_phys->zp_flags;
+ zflags = VTOZ(vp)->z_pflags;
if (vap->va_flags != VNOVAL) {
zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
@@ -4805,7 +6095,7 @@ zfs_reclaim_complete(void *arg, int pending)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf != NULL) {
+ if (zp->z_sa_hdl != NULL) {
ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
@@ -4830,8 +6120,9 @@ zfs_freebsd_reclaim(ap)
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ boolean_t rlocked;
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
ASSERT(zp != NULL);
@@ -4841,15 +6132,17 @@ zfs_freebsd_reclaim(ap)
vnode_destroy_vobject(vp);
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_phys != NULL);
zp->z_vnode = NULL;
mutex_exit(&zp->z_lock);
- if (zp->z_unlinked)
+ if (zp->z_unlinked) {
; /* Do nothing. */
- else if (zp->z_dbuf == NULL)
+ } else if (!rlocked) {
+ TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
+ taskqueue_enqueue(taskqueue_thread, &zp->z_task);
+ } else if (zp->z_sa_hdl == NULL) {
zfs_znode_free(zp);
- else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
+ } else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
int locked;
locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
@@ -4872,7 +6165,8 @@ zfs_freebsd_reclaim(ap)
vp->v_data = NULL;
ASSERT(vp->v_holdcnt >= 1);
VI_UNLOCK(vp);
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ if (rlocked)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index dbee467dbc20..1f3d0b2a31f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -47,6 +46,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */
@@ -56,9 +56,13 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#include <sys/refcount.h>
#include "zfs_prop.h"
+#include "zfs_comutil.h"
/* Used by fstat(1). */
SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
@@ -78,9 +82,6 @@ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
#define ZNODE_STAT_ADD(stat) /* nothing */
#endif /* ZNODE_STATS */
-#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
-#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
/*
* Functions needed for userland (ie: libzpool) are not put under
* #ifdef_KERNEL; the rest of the functions have dependencies
@@ -99,35 +100,11 @@ static kmem_cache_t *znode_cache = NULL;
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
-#if 1 /* XXXPJD: From OpenSolaris. */
/*
* We should never drop all dbuf refs without first clearing
* the eviction callback.
*/
panic("evicting znode %p\n", user_ptr);
-#else /* XXXPJD */
- znode_t *zp = user_ptr;
- vnode_t *vp;
-
- mutex_enter(&zp->z_lock);
- zp->z_dbuf = NULL;
- vp = ZTOV(zp);
- if (vp == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else if (vp->v_count == 0) {
- zp->z_vnode = NULL;
- vhold(vp);
- mutex_exit(&zp->z_lock);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
- vrecycle(vp, curthread);
- VOP_UNLOCK(vp, 0);
- vdrop(vp);
- zfs_znode_free(zp);
- } else {
- mutex_exit(&zp->z_lock);
- }
-#endif
}
extern struct vop_vector zfs_vnodeops;
@@ -140,6 +117,7 @@ extern struct vop_vector zfs_shareops;
* to pass vfsp here, which is not possible, because argument
* 'cdrarg' is defined at kmem_cache_create() time.
*/
+/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
@@ -160,6 +138,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_vnode = vp;
vp->v_data = (caddr_t)zp;
VN_LOCK_AREC(vp);
+ VN_LOCK_ASHARE(vp);
} else {
zp->z_vnode = NULL;
}
@@ -175,9 +154,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dbuf = NULL;
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
+ zp->z_moved = 0;
return (0);
}
@@ -198,7 +177,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -215,7 +193,7 @@ static struct {
} znode_move_stats;
#endif /* ZNODE_STATS */
-#if defined(sun)
+#ifdef sun
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
@@ -240,11 +218,17 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
nzp->z_blksz = ozp->z_blksz;
nzp->z_seq = ozp->z_seq;
nzp->z_mapcnt = ozp->z_mapcnt;
- nzp->z_last_itx = ozp->z_last_itx;
nzp->z_gen = ozp->z_gen;
nzp->z_sync_cnt = ozp->z_sync_cnt;
- nzp->z_phys = ozp->z_phys;
- nzp->z_dbuf = ozp->z_dbuf;
+ nzp->z_is_sa = ozp->z_is_sa;
+ nzp->z_sa_hdl = ozp->z_sa_hdl;
+ bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+ nzp->z_links = ozp->z_links;
+ nzp->z_size = ozp->z_size;
+ nzp->z_pflags = ozp->z_pflags;
+ nzp->z_uid = ozp->z_uid;
+ nzp->z_gid = ozp->z_gid;
+ nzp->z_mode = ozp->z_mode;
/*
* Since this is just an idle znode and kmem is already dealing with
@@ -255,9 +239,7 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
ozp->z_acl_cached = NULL;
}
- /* Update back pointers. */
- (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
- znode_evict_error);
+ sa_set_userp(nzp->z_sa_hdl, nzp);
/*
* Invalidate the original znode by clearing fields that provide a
@@ -265,8 +247,14 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
* ensure that zfs_znode_move() recognizes the znode as invalid in any
* subsequent callback.
*/
- ozp->z_dbuf = NULL;
+ ozp->z_sa_hdl = NULL;
POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+ /*
+ * Mark the znode.
+ */
+ nzp->z_moved = 1;
+ ozp->z_moved = (uint8_t)-1;
}
/*ARGSUSED*/
@@ -389,14 +377,19 @@ zfs_znode_init(void)
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-#if defined(sun)
kmem_cache_set_move(znode_cache, zfs_znode_move);
-#endif
}
void
zfs_znode_fini(void)
{
+#ifdef sun
+ /*
+ * Cleanup vfs & vnode ops
+ */
+ zfs_remove_op_tables();
+#endif /* sun */
+
/*
* Cleanup zcache
*/
@@ -406,6 +399,100 @@ zfs_znode_fini(void)
rw_destroy(&zfsvfs_lock);
}
+#ifdef sun
+struct vnodeops *zfs_dvnodeops;
+struct vnodeops *zfs_fvnodeops;
+struct vnodeops *zfs_symvnodeops;
+struct vnodeops *zfs_xdvnodeops;
+struct vnodeops *zfs_evnodeops;
+struct vnodeops *zfs_sharevnodeops;
+
+void
+zfs_remove_op_tables()
+{
+ /*
+ * Remove vfs ops
+ */
+ ASSERT(zfsfstype);
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ zfsfstype = 0;
+
+ /*
+ * Remove vnode ops
+ */
+ if (zfs_dvnodeops)
+ vn_freevnodeops(zfs_dvnodeops);
+ if (zfs_fvnodeops)
+ vn_freevnodeops(zfs_fvnodeops);
+ if (zfs_symvnodeops)
+ vn_freevnodeops(zfs_symvnodeops);
+ if (zfs_xdvnodeops)
+ vn_freevnodeops(zfs_xdvnodeops);
+ if (zfs_evnodeops)
+ vn_freevnodeops(zfs_evnodeops);
+ if (zfs_sharevnodeops)
+ vn_freevnodeops(zfs_sharevnodeops);
+
+ zfs_dvnodeops = NULL;
+ zfs_fvnodeops = NULL;
+ zfs_symvnodeops = NULL;
+ zfs_xdvnodeops = NULL;
+ zfs_evnodeops = NULL;
+ zfs_sharevnodeops = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+extern const fs_operation_def_t zfs_sharevnodeops_template[];
+
+int
+zfs_create_op_tables()
+{
+ int error;
+
+ /*
+ * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+ * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+ * In this case we just return as the ops vectors are already set up.
+ */
+ if (zfs_dvnodeops)
+ return (0);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+ &zfs_dvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+ &zfs_fvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+ &zfs_symvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+ &zfs_xdvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+ &zfs_evnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
+ &zfs_sharevnodeops);
+
+ return (error);
+}
+#endif /* sun */
+
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
@@ -424,9 +511,12 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
sharezp->z_vnode = &vnode;
vnode.v_data = sharezp;
@@ -436,8 +526,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
kcred, NULL, &acl_ids));
- zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
- &zp, 0, &acl_ids);
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, sharezp);
POINTER_INVALIDATE(&sharezp->z_zfsvfs);
error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
@@ -449,9 +538,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
ZTOV(sharezp)->v_count = 0;
ZTOV(sharezp)->v_holdcnt = 0;
zp->z_vnode = NULL;
+ sa_handle_destroy(sharezp->z_sa_hdl);
sharezp->z_vnode = NULL;
- dmu_buf_rele(sharezp->z_dbuf, NULL);
- sharezp->z_dbuf = NULL;
kmem_cache_free(znode_cache, sharezp);
return (error);
@@ -498,26 +586,25 @@ zfs_cmpldev(uint64_t dev)
}
static void
-zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
- znode_t *nzp;
-
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_dbuf == NULL);
+ ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
- zp->z_dbuf = db;
- nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
- /*
- * there should be no
- * concurrent zgets on this object.
- */
- if (nzp != NULL)
- panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
/*
* Slap on VROOT if we are the root znode
@@ -532,14 +619,12 @@ zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
void
zfs_znode_dmu_fini(znode_t *zp)
{
- dmu_buf_t *db = zp->z_dbuf;
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
zp->z_unlinked ||
RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
- ASSERT(zp->z_dbuf != NULL);
- zp->z_dbuf = NULL;
- VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
- dmu_buf_rele(db, NULL);
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
}
/*
@@ -550,47 +635,67 @@ zfs_znode_dmu_fini(znode_t *zp)
* return the znode
*/
static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
{
znode_t *zp;
vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
ASSERT(zp->z_dirlocks == NULL);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
/*
* Defer setting z_zfsvfs until the znode is ready to be a candidate for
* the zfs_znode_move() callback.
*/
- zp->z_phys = NULL;
+ zp->z_sa_hdl = NULL;
zp->z_unlinked = 0;
zp->z_atime_dirty = 0;
zp->z_mapcnt = 0;
- zp->z_last_itx = 0;
zp->z_id = db->db_object;
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
vp = ZTOV(zp);
-#ifdef TODO
- vn_reinit(vp);
-#endif
- zfs_znode_dmu_init(zfsvfs, zp, db);
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
- zp->z_gen = zp->z_phys->zp_gen;
+ zp->z_mode = mode;
-#if 0
- if (vp == NULL)
- return (zp);
-#endif
+ vp->v_type = IFTOVT((mode_t)mode);
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
switch (vp->v_type) {
case VDIR:
zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
@@ -598,8 +703,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
case VFIFO:
vp->v_op = &zfs_fifoops;
break;
- case VREG:
- if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
vp->v_op = &zfs_shareops;
}
break;
@@ -621,6 +727,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
return (zp);
}
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
/*
* Create a new DMU object to hold a zfs znode.
*
@@ -640,14 +749,23 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
*/
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
- dmu_buf_t *db;
- znode_phys_t *pzp;
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
timestruc_t now;
uint64_t gen, obj;
int err;
+ int bonuslen;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t sa_attrs[ZPL_END];
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
@@ -661,12 +779,16 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
gen = dmu_tx_get_txg(tx);
}
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+
/*
* Create a new DMU object.
*/
/*
* There's currently no mechanism for pre-reading the blocks that will
- * be to needed allocate a new object, so we accept the small chance
+ * be needed to allocate a new object, so we accept the small chance
* that there will be an i/o error and we will fail one of the
* assertions below.
*/
@@ -674,103 +796,202 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (zfsvfs->z_replay) {
err = zap_create_claim_norm(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = zap_create_norm(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
} else {
if (zfsvfs->z_replay) {
err = dmu_object_claim(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
}
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
- dmu_buf_will_dirty(db, tx);
-
- /*
- * Initialize the znode physical data to zero.
- */
- ASSERT(db->db_size >= sizeof (znode_phys_t));
- bzero(db->db_data, db->db_size);
- pzp = db->db_data;
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
* to reference the just-allocated physical data area.
*/
if (flag & IS_ROOT_NODE) {
- dzp->z_dbuf = db;
- dzp->z_phys = pzp;
dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
}
/*
* If parent is an xattr, so am I.
*/
- if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ if (dzp_pflags & ZFS_XATTR) {
flag |= IS_XATTR;
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- pzp->zp_rdev = zfs_expldev(vap->va_rdev);
}
if (zfsvfs->z_use_fuids)
- pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
if (vap->va_type == VDIR) {
- pzp->zp_size = 2; /* contents ("." and "..") */
- pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
}
- pzp->zp_parent = dzp->z_id;
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
if (flag & IS_XATTR)
- pzp->zp_flags |= ZFS_XATTR;
+ pflags |= ZFS_XATTR;
- pzp->zp_gen = gen;
+ /*
+ * No execs denied will be determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
- ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
- ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&now, atime);
}
if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&now, mtime);
}
- pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering.
+ */
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
if (!(flag & IS_ROOT_NODE)) {
- *zpp = zfs_znode_alloc(zfsvfs, db, 0);
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
} else {
/*
* If we are creating the root node, the "parent" we
* passed in is the znode for the root.
*/
*zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
}
- pzp->zp_uid = acl_ids->z_fuid;
- pzp->zp_gid = acl_ids->z_fgid;
- pzp->zp_mode = acl_ids->z_mode;
- VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+
if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(*zpp, (xvattr_t *)vap);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
+ ASSERT3P(err, ==, 0);
+ }
if (!(flag & IS_ROOT_NODE)) {
vnode_t *vp;
@@ -780,10 +1001,16 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
vp->v_vflag &= ~VV_FORCEINSMQ;
KASSERT(err == 0, ("insmntque() failed: error %d", err));
}
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
+/*
+ * zfs_xvattr_set only updates the in-core attributes;
+ * it is assumed the caller will be doing an sa_bulk_update
+ * to push the changes out.
+ */
void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
xoptattr_t *xoap;
@@ -791,60 +1018,86 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
ASSERT(xoap);
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
- xoap->xoa_av_quarantined);
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
- sizeof (xoap->xoa_av_scanstamp));
- zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+ zfs_sa_set_scanstamp(zp, xvap, tx);
XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
}
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
int
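
A minimal sketch of the caller-side pattern the zfs_xvattr_set() comment above assumes: update the in-core znode fields first, then push the changed attributes out with a single sa_bulk_update() under the open transaction. It presumes a znode zp, its zfsvfs, an assigned dmu_tx_t *tx and a new_size value from surrounding code; the choice of attributes is illustrative only, and SA_ADD_BULK_ATTR()/SA_ZPL_*()/sa_bulk_update() are the same primitives used throughout this patch.

	sa_bulk_attr_t bulk[2];
	int count = 0;

	/* In-core updates first. */
	zp->z_size = new_size;
	zp->z_pflags |= ZFS_ARCHIVE;

	/* Queue the attributes that changed... */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	/* ...and write them out in one bulk update inside the transaction. */
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
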
@@ -853,41 +1106,50 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
dmu_object_info_t doi;
dmu_buf_t *db;
znode_t *zp;
- vnode_t *vp;
- int err, first = 1;
+ int err;
+ sa_handle_t *hdl;
+ int first = 1;
*zpp = NULL;
+
again:
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- zp = dmu_buf_get_user(db);
- if (zp != NULL) {
- mutex_enter(&zp->z_lock);
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
/*
- * Since we do immediate eviction of the z_dbuf, we
- * should never find a dbuf with a znode that doesn't
- * know about the dbuf.
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
*/
- ASSERT3P(zp->z_dbuf, ==, db);
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_unlinked) {
err = ENOENT;
} else {
+ vnode_t *vp;
int dying = 0;
vp = ZTOV(zp);
@@ -915,7 +1177,7 @@ again:
* znode is dying so we can't reuse it, we must
* wait until destruction is completed.
*/
- dmu_buf_rele(db, NULL);
+ sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
if (vp != NULL)
@@ -926,7 +1188,7 @@ again:
*zpp = zp;
err = 0;
}
- dmu_buf_rele(db, NULL);
+ sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -938,24 +1200,30 @@ again:
*
* There is a small window where zfs_vget() could
* find this object while a file create is still in
- * progress. Since a gen number can never be zero
- * we will check that to determine if its an allocated
- * file.
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
*/
-
- if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = ENOENT;
+ } else {
*zpp = zp;
- vp = ZTOV(zp);
- vp->v_vflag |= VV_FORCEINSMQ;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
err = insmntque(vp, zfsvfs->z_vfs);
- vp->v_vflag &= ~VV_FORCEINSMQ;
- KASSERT(err == 0, ("insmntque() failed: error %d", err));
- VOP_UNLOCK(vp, 0);
- err = 0;
- } else {
- dmu_buf_rele(db, NULL);
- err = ENOENT;
+ if (err == 0)
+ VOP_UNLOCK(vp, 0);
+ else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
}
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -968,40 +1236,91 @@ zfs_rezget(znode_t *zp)
dmu_object_info_t doi;
dmu_buf_t *db;
uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
int err;
+ int count = 0;
+ uint64_t gen;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
- dmu_buf_rele(db, NULL);
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
}
- mutex_enter(&zp->z_acl_lock);
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
+ }
+
+ /*
+ * XXXPJD: Not sure how that is possible, but under heavy
+ * zfs recv -F load it happens that z_gen is the same, but
+ * the vnode type is different from the znode type. This would
+ * mean that, for example, a regular file was replaced with a
+ * directory that has the same object number.
+ */
+ if (ZTOV(zp) != NULL &&
+ ZTOV(zp)->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
}
- mutex_exit(&zp->z_acl_lock);
- zfs_znode_dmu_init(zfsvfs, zp, db);
- zp->z_unlinked = (zp->z_phys->zp_links == 0);
+ zp->z_unlinked = (zp->z_links == 0);
zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size && ZTOV(zp) != NULL)
+ vnode_pager_setsize(ZTOV(zp), zp->z_size);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1014,11 +1333,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
uint64_t obj = zp->z_id;
- uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ uint64_t acl_obj = zfs_external_acl(zp);
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- if (acl_obj)
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
VERIFY(0 == dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1033,7 +1354,7 @@ zfs_zinactive(znode_t *zp)
uint64_t z_id = zp->z_id;
int vfslocked;
- ASSERT(zp->z_dbuf && zp->z_phys);
+ ASSERT(zp->z_sa_hdl);
/*
* Don't allow a zfs_zget() while were trying to release this znode
@@ -1069,6 +1390,7 @@ zfs_zinactive(znode_t *zp)
VFS_UNLOCK_GIANT(vfslocked);
return;
}
+
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
}
@@ -1079,6 +1401,7 @@ zfs_znode_free(znode_t *zp)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(ZTOV(zp) == NULL);
+ ASSERT(zp->z_sa_hdl == NULL);
mutex_enter(&zfsvfs->z_znodes_lock);
POINTER_INVALIDATE(&zp->z_zfsvfs);
list_remove(&zfsvfs->z_all_znodes, zp);
@@ -1095,59 +1418,40 @@ zfs_znode_free(znode_t *zp)
}
void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
{
timestruc_t now;
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
gethrestime(&now);
- if (tx) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
zp->z_atime_dirty = 0;
zp->z_seq++;
} else {
zp->z_atime_dirty = 1;
}
- if (flag & AT_ATIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
if (flag & AT_MTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
- if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
}
if (flag & AT_CTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= ZFS_ARCHIVE;
+ zp->z_pflags |= ZFS_ARCHIVE;
}
}
/*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- * 1 - Only the ACCESS time is ever updated outside of a transaction.
- * 2 - Multiple consecutive updates will be collapsed into a single
- * znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- mutex_enter(&zp->z_lock);
- zfs_time_stamper_locked(zp, flag, tx);
- mutex_exit(&zp->z_lock);
-}
-
-/*
* Grow the block size for a file.
*
* IN: zp - znode of file to free data in.
@@ -1169,19 +1473,36 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
* we will not grow. If there is more than one block in a file,
* the blocksize cannot change.
*/
- if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
return;
error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
size, 0, tx);
+
if (error == ENOTSUP)
return;
ASSERT3U(error, ==, 0);
/* What blocksize did we actually get? */
- dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}
+#ifdef sun
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away* don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+#endif /* sun */
+
/*
* Increase the file length
*
@@ -1208,13 +1529,14 @@ zfs_extend(znode_t *zp, uint64_t end)
/*
* Nothing to do if file already at desired length.
*/
- if (end <= zp->z_phys->zp_size) {
+ if (end <= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
if (end > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
/*
@@ -1242,19 +1564,21 @@ top:
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
if (newblksz)
zfs_grow_blocksize(zp, newblksz, tx);
- zp->z_phys->zp_size = end;
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
zfs_range_unlock(rl);
dmu_tx_commit(tx);
- vnode_pager_setsize(ZTOV(zp), end);
-
return (0);
}
@@ -1283,20 +1607,21 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
/*
* Nothing to do if file already at desired length.
*/
- if (off >= zp->z_phys->zp_size) {
+ if (off >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
- if (off + len > zp->z_phys->zp_size)
- len = zp->z_phys->zp_size - off;
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
if (error == 0) {
/*
* In FreeBSD we cannot free block in the middle of a file,
- * but only at the end of a file.
+ * but only at the end of a file, so this code path should
+ * never happen.
*/
vnode_pager_setsize(ZTOV(zp), off);
}
@@ -1323,6 +1648,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
dmu_tx_t *tx;
rl_t *rl;
int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
/*
* We will change zp_size, lock the whole file.
@@ -1332,7 +1659,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
/*
* Nothing to do if file already at desired length.
*/
- if (end >= zp->z_phys->zp_size) {
+ if (end >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
@@ -1344,7 +1671,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1356,9 +1684,17 @@ top:
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_size = end;
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
dmu_tx_commit(tx);
@@ -1394,9 +1730,17 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
int error;
- if (off > zp->z_phys->zp_size) {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
error = zfs_extend(zp, off+len);
if (error == 0 && log)
goto log;
@@ -1404,18 +1748,29 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
return (error);
}
+ /*
+ * Check for any locks in the region to be freed.
+ */
+
+ if (MANDLOCK(vp, (mode_t)mode)) {
+ uint64_t length = (len ? len : zp->z_size - off);
+ if (error = chklock(vp, FWRITE, off, length, flag, NULL))
+ return (error);
+ }
+
if (len == 0) {
error = zfs_trunc(zp, off);
} else {
if ((error = zfs_free_range(zp, off, len)) == 0 &&
- off + len > zp->z_phys->zp_size)
+ off + len > zp->z_size)
error = zfs_extend(zp, off+len);
}
if (error || !log)
return (error);
log:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1427,7 +1782,14 @@ log:
return (error);
}
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
dmu_tx_commit(tx);
@@ -1438,7 +1800,7 @@ void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
zfsvfs_t zfsvfs;
- uint64_t moid, obj, version;
+ uint64_t moid, obj, sa_obj, version;
uint64_t sense = ZFS_CASE_SENSITIVE;
uint64_t norm = 0;
nvpair_t *elem;
@@ -1465,12 +1827,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
/*
* Set starting attributes.
*/
- if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
- version = ZPL_VERSION;
- else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
- version = ZPL_VERSION_USERSPACE - 1;
- else
- version = ZPL_VERSION_FUID - 1;
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
elem = NULL;
while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
/* For the moment we expect all zpl props to be uint64_ts */
@@ -1496,6 +1853,18 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
/*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
* Create a delete queue.
*/
obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
@@ -1514,22 +1883,32 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
vattr.va_uid = crgetuid(cr);
vattr.va_gid = crgetgid(cr);
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
zfs_znode_cache_constructor(rootzp, NULL, 0);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
vnode.v_type = VDIR;
vnode.v_data = rootzp;
rootzp->z_vnode = &vnode;
- bzero(&zfsvfs, sizeof (zfsvfs_t));
-
zfsvfs.z_os = os;
zfsvfs.z_parent = &zfsvfs;
zfsvfs.z_version = version;
zfsvfs.z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs.z_use_sa = USE_SA(version, os);
zfsvfs.z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs.z_attr_table);
+
+ ASSERT(error == 0);
+
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
@@ -1544,19 +1923,17 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_zfsvfs = &zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
- zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
ASSERT(error == 0);
zfs_acl_ids_free(&acl_ids);
POINTER_INVALIDATE(&rootzp->z_zfsvfs);
- dmu_buf_rele(rootzp->z_dbuf, NULL);
- rootzp->z_dbuf = NULL;
+ sa_handle_destroy(rootzp->z_sa_hdl);
rootzp->z_vnode = NULL;
kmem_cache_free(znode_cache, rootzp);
@@ -1573,44 +1950,122 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
#endif /* _KERNEL */
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
+
static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
{
- dmu_buf_t *db;
dmu_object_info_t doi;
- znode_phys_t *zp;
int error;
- if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
return (error);
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ doi.doi_bonus_type == DMU_OT_ZNODE &&
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, FTAG);
- return (EINVAL);
+ sa_buf_rele(*db, tag);
+ return (ENOTSUP);
}
- zp = db->db_data;
- *pobjp = zp->zp_parent;
- *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
- S_ISDIR(zp->zp_mode);
- dmu_buf_rele(db, FTAG);
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
return (0);
}
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp,
+ int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ *pobjp = parent;
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
char *path = buf + len - 1;
int error;
*path = '\0';
+ sa_hdl = hdl;
for (;;) {
uint64_t pobj;
@@ -1618,7 +2073,10 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
size_t complen;
int is_xattrdir;
- if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+ if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
&is_xattrdir)) != 0)
break;
@@ -1643,9 +2101,80 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
ASSERT(path >= buf);
bcopy(component, path, complen);
obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
}
if (error == 0)
(void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
return (error);
}
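
A brief caller-side sketch for the rewritten lookup helpers above, assuming an open objset_t *osp and an object number obj; the MAXPATHLEN buffer and the printf() reporting are illustrative only.

	char path[MAXPATHLEN];
	zfs_stat_t sb;
	int error;

	/* Resolve the object number to a path below the dataset root. */
	error = zfs_obj_to_path(osp, obj, path, sizeof (path));
	if (error == 0)
		printf("obj %ju: %s\n", (uintmax_t)obj, path);

	/* Same walk, additionally returning mode, gen, links and ctime. */
	error = zfs_obj_to_stats(osp, obj, &sb, path, sizeof (path));
	if (error == 0)
		printf("obj %ju: mode %#jo links %ju\n", (uintmax_t)obj,
		    (uintmax_t)sb.zs_mode, (uintmax_t)sb.zs_links);
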
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 490e50fbb94c..5c7b22e2a3f3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -19,13 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/zfs_context.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -34,8 +34,9 @@
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@@ -66,11 +67,11 @@
/*
* This global ZIL switch affects all pools
*/
-int zil_disable = 0; /* disable intent logging */
+int zil_replay_disable = 0; /* disable intent logging replay */
SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RW, &zil_disable, 0,
- "Disable ZFS Intent Log (ZIL)");
+TUNABLE_INT("vfs.zfs.zil_replay_disable", &zil_replay_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RW,
+ &zil_replay_disable, 0, "Disable intent logging replay");
/*
* Tunable parameter for debugging or performance analysis. Setting
@@ -84,11 +85,26 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
static kmem_cache_t *zil_lwb_cache;
+static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
+
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
static int
-zil_dva_compare(const void *x1, const void *x2)
+zil_bp_compare(const void *x1, const void *x2)
{
- const dva_t *dva1 = x1;
- const dva_t *dva2 = x2;
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
return (-1);
@@ -104,34 +120,37 @@ zil_dva_compare(const void *x1, const void *x2)
}
static void
-zil_dva_tree_init(avl_tree_t *t)
+zil_bp_tree_init(zilog_t *zilog)
{
- avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
- offsetof(zil_dva_node_t, zn_node));
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}
static void
-zil_dva_tree_fini(avl_tree_t *t)
+zil_bp_tree_fini(zilog_t *zilog)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
void *cookie = NULL;
while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zn, sizeof (zil_dva_node_t));
+ kmem_free(zn, sizeof (zil_bp_node_t));
avl_destroy(t);
}
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva = BP_IDENTITY(bp);
+ zil_bp_node_t *zn;
avl_index_t where;
if (avl_find(t, dva, &where) != NULL)
return (EEXIST);
- zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
zn->zn_dva = *dva;
avl_insert(t, zn, where);
@@ -156,35 +175,31 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
}
/*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
+ * Read a log block and make sure it's valid.
*/
static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+ char **end)
{
- blkptr_t blk = *bp;
- zbookmark_t zb;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_t zb;
int error;
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
- *abufpp = NULL;
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
- /*
- * We shouldn't be doing any scrubbing while we're doing log
- * replay, it's OK to not lock.
- */
- error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
- arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
- char *data = (*abufpp)->b_data;
- uint64_t blksz = BP_GET_LSIZE(bp);
- zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
zio_cksum_t cksum = bp->blk_cksum;
/*
@@ -197,43 +212,102 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
- if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
- (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
- error = ECKSUM;
- }
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
- if (error) {
- VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
- *abufpp = NULL;
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
}
+
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ }
return (error);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
*/
-uint64_t
+int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
const zil_header_t *zh = zilog->zl_header;
- uint64_t claim_seq = zh->zh_claim_seq;
- uint64_t seq = 0;
- uint64_t max_seq = 0;
- blkptr_t blk = zh->zh_log;
- arc_buf_t *abuf;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
char *lrbuf, *lrp;
- zil_trailer_t *ztp;
- int reclen, error;
+ int error = 0;
- if (BP_IS_HOLE(&blk))
- return (max_seq);
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
/*
* Starting at the block pointed to by zh_log we read the log chain.
@@ -244,105 +318,156 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- zil_dva_tree_init(&zilog->zl_dva_tree);
- for (;;) {
- seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
- if (claim_seq != 0 && seq > claim_seq)
- break;
-
- ASSERT(max_seq < seq);
- max_seq = seq;
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
+ char *end;
- error = zil_read_log_block(zilog, &blk, &abuf);
+ if (blk_seq > claim_blk_seq)
+ break;
+ if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+ break;
+ ASSERT3U(max_blk_seq, <, blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
- if (parse_blk_func != NULL)
- parse_blk_func(zilog, &blk, arg, txg);
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+ break;
+ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
if (error)
break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
-
- if (parse_lr_func == NULL) {
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- continue;
- }
-
- for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- parse_lr_func(zilog, lr, arg, txg);
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+ if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+ goto done;
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
}
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- zil_dva_tree_fini(&zilog->zl_dva_tree);
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
- return (max_seq);
+ return (error);
}
-/* ARGSUSED */
-static void
+static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
- spa_t *spa = zilog->zl_spa;
- int err;
-
/*
* Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
*/
- if (bp->blk_birth >= first_txg &&
- zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
- err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
- ASSERT(err == 0);
- }
+ if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}
-static void
+static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
- }
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg &&
+ (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+ return (error);
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
/* ARGSUSED */
-static void
+static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
- zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+ zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
}
-static void
+static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
/*
* If we previously claimed it, we need to free it.
*/
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- if (bp->blk_birth >= claim_txg &&
- !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
- (void) arc_free(NULL, zilog->zl_spa,
- dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
- }
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+ lwb->lwb_tx = NULL;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ return (lwb);
}
/*
* Create an on-disk intent log.
*/
-static void
+static lwb_t *
zil_create(zilog_t *zilog)
{
const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
+ lwb_t *lwb = NULL;
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
blkptr_t blk;
@@ -359,22 +484,23 @@ zil_create(zilog_t *zilog)
blk = zh->zh_log;
/*
- * If we don't already have an initial log block or we have one
- * but it's the wrong endianness then allocate one.
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block has the wrong endianness
*/
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
- zio_free_blk(zilog->zl_spa, &blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
- NULL, txg);
+ error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+ ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -383,20 +509,8 @@ zil_create(zilog_t *zilog)
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
- if (error == 0) {
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = blk;
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
- lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
- lwb->lwb_max_txg = txg;
- lwb->lwb_zio = NULL;
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
- }
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -409,6 +523,8 @@ zil_create(zilog_t *zilog)
}
ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+ return (lwb);
}
/*
@@ -433,26 +549,18 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_old_header = *zh; /* debugging aid */
+
if (BP_IS_HOLE(&zh->zh_log))
return;
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
mutex_enter(&zilog->zl_lock);
- /*
- * It is possible for the ZIL to get the previously mounted zilog
- * structure of the same dataset if quickly remounted and the dbuf
- * eviction has not completed. In this case we can see a non
- * empty lwb list and keep_first will be set. We fix this by
- * clearing the keep_first. This will be slower but it's very rare.
- */
- if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
- keep_first = B_FALSE;
-
ASSERT3U(zilog->zl_destroy_txg, <, txg);
zilog->zl_destroy_txg = txg;
zilog->zl_keep_first = keep_first;
@@ -464,41 +572,20 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
}
- } else {
- if (!keep_first) {
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zh->zh_claim_txg);
- }
+ } else if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
}
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
}
-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (B_TRUE);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (B_TRUE);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (B_FALSE);
-}
-
int
-zil_claim(char *osname, void *txarg)
+zil_claim(const char *osname, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -507,7 +594,7 @@ zil_claim(char *osname, void *txarg)
objset_t *os;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
return (0);
@@ -516,28 +603,13 @@ zil_claim(char *osname, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
- if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
if (!BP_IS_HOLE(&zh->zh_log))
- zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+ zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- /*
- * Record here whether the zil has any records to replay.
- * If the header block pointer is null or the block points
- * to the stubby then we know there are no valid log records.
- * We use the header to store this state as the the zilog gets
- * freed later in dmu_objset_close().
- * The flags (and the rest of the header fields) are cleared in
- * zil_sync() as a result of a zil_destroy(), after replaying the log.
- *
- * Note, the intent log can be empty but still need the
- * stubby to be claimed.
- */
- if (!zil_empty(zilog)) {
- zh->zh_flags |= ZIL_REPLAY_NEEDED;
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ dmu_objset_rele(os, FTAG);
+ return (0);
}
/*
@@ -549,14 +621,19 @@ zil_claim(char *osname, void *txarg)
*/
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
- zh->zh_claim_txg = first_txg;
- zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ (void) zil_parse(zilog, zil_claim_log_block,
zil_claim_log_record, tx, first_txg);
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
}
ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (0);
}
@@ -565,53 +642,67 @@ zil_claim(char *osname, void *txarg)
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
-/* ARGSUSED */
int
-zil_check_log_chain(char *osname, void *txarg)
+zil_check_log_chain(const char *osname, void *tx)
{
zilog_t *zilog;
- zil_header_t *zh;
- blkptr_t blk;
- arc_buf_t *abuf;
objset_t *os;
- char *lrbuf;
- zil_trailer_t *ztp;
+ blkptr_t *bp;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ ASSERT(tx == NULL);
+
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
return (0);
}
zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
- blk = zh->zh_log;
- if (BP_IS_HOLE(&blk)) {
- dmu_objset_close(os);
- return (0); /* no chain */
- }
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
- for (;;) {
- error = zil_read_log_block(zilog, &blk, &abuf);
- if (error)
- break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the log
+ * as its content should have already been synced to the pool.
+ */
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
}
- dmu_objset_close(os);
- if (error == ECKSUM)
- return (0); /* normal end of chain */
- return (error);
+
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+
+ dmu_objset_rele(os, FTAG);
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
static int
zil_vdev_compare(const void *x1, const void *x2)
{
- uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
- uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
if (v1 < v2)
return (-1);
@@ -622,7 +713,7 @@ zil_vdev_compare(const void *x1, const void *x2)
}
void
-zil_add_block(zilog_t *zilog, blkptr_t *bp)
+zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_vdev_tree;
avl_index_t where;
@@ -652,7 +743,7 @@ zil_add_block(zilog_t *zilog, blkptr_t *bp)
mutex_exit(&zilog->zl_vdev_lock);
}
-void
+static void
zil_flush_vdevs(zilog_t *zilog)
{
spa_t *spa = zilog->zl_spa;
@@ -698,9 +789,9 @@ zil_lwb_write_done(zio_t *zio)
{
lwb_t *lwb = zio->io_private;
zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
@@ -719,17 +810,15 @@ zil_lwb_write_done(zio_t *zio)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
- if (zio->io_error)
- zilog->zl_log_error = B_TRUE;
+ lwb->lwb_tx = NULL;
+ mutex_exit(&zilog->zl_lock);
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync. We still have the
- * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+ * which we allocated the next block sync.
*/
- txg_rele_to_sync(&lwb->lwb_txgh);
- mutex_exit(&zilog->zl_lock);
+ dmu_tx_commit(tx);
}
/*
@@ -740,10 +829,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
zbookmark_t zb;
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
if (zilog->zl_root_zio == NULL) {
zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
@@ -751,118 +839,147 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
}
if (lwb->lwb_zio == NULL) {
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+ 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
/*
+ * Define a limited set of intent log block sizes.
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+uint64_t zil_block_buckets[] = {
+ 4096, /* non TX_WRITE */
+ 8192+4096, /* data base */
+ 32*1024 + 4096, /* NFS writes */
+ UINT64_MAX
+};
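As a rough illustration of how these buckets are consumed later in this change, the sketch below walks the table to find the smallest bucket that holds the bytes queued for the next block, falling back to the largest allowed block when nothing fits. MAX_LOG_BLKSZ is a hypothetical stand-in for SPA_MAXBLOCKSIZE.

#include <stdint.h>
#include <stdio.h>

/* Same bucket table as above; UINT64_MAX terminates the list. */
static const uint64_t buckets[] = {
	4096, 8192 + 4096, 32 * 1024 + 4096, UINT64_MAX
};

/* Hypothetical cap standing in for SPA_MAXBLOCKSIZE (128K in this sketch). */
#define	MAX_LOG_BLKSZ	(128 * 1024)

/* Pick the smallest bucket that holds `need' bytes, else the maximum size. */
static uint64_t
pick_log_blksz(uint64_t need)
{
	int i;

	for (i = 0; need > buckets[i]; i++)
		continue;
	return (buckets[i] == UINT64_MAX ? MAX_LOG_BLKSZ : buckets[i]);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)pick_log_blksz(2048));	/* 4096 */
	printf("%llu\n", (unsigned long long)pick_log_blksz(20000));	/* 36864 */
	printf("%llu\n", (unsigned long long)pick_log_blksz(300000));	/* 131072 */
	return (0);
}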
+
+/*
+ * Use the slog as long as the logbias is 'latency' and the current commit size
+ * is less than the limit or the total list size is less than 2X the limit.
+ * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
+ */
+uint64_t zil_slog_limit = 1024 * 1024;
+#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
+ (((zilog)->zl_cur_used < zil_slog_limit) || \
+ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
+
+/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *nlwb;
- zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
spa_t *spa = zilog->zl_spa;
- blkptr_t *bp = &ztp->zit_next_blk;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
uint64_t txg;
- uint64_t zil_blksz;
- int error;
+ uint64_t zil_blksz, wsz;
+ int i, error;
- ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
*/
- txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
- txg_rele_to_quiesce(&lwb->lwb_txgh);
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
/*
- * Pick a ZIL blocksize. We request a size that is the
- * maximum of the previous used size, the current used size and
- * the amount waiting in the queue.
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of, say, 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
*/
- zil_blksz = MAX(zilog->zl_prev_used,
- zilog->zl_cur_used + sizeof (*ztp));
- zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
- zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
- if (zil_blksz > ZIL_MAX_BLKSZ)
- zil_blksz = ZIL_MAX_BLKSZ;
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+ continue;
+ zil_blksz = zil_block_buckets[i];
+ if (zil_blksz == UINT64_MAX)
+ zil_blksz = SPA_MAXBLOCKSIZE;
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
- if (error) {
- dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
- /*
- * We dirty the dataset to ensure that zil_sync() will
- * be called to remove this lwb from our zl_lwb_list.
- * Failing to do so, may leave an lwb with a NULL lwb_buf
- * hanging around on the zl_lwb_list.
- */
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- dmu_tx_commit(tx);
+ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
+ USE_SLOG(zilog));
+ if (!error) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
/*
- * Since we've just experienced an allocation failure so we
- * terminate the current lwb and send it on its way.
+ * Allocate a new log write buffer (lwb).
*/
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- zio_nowait(lwb->lwb_zio);
+ nlwb = zil_alloc_lwb(zilog, bp, txg);
- /*
- * By returning NULL the caller will call tx_wait_synced()
- */
- return (NULL);
+ /* Record the block for later vdev flushing */
+ zil_add_block(zilog, &lwb->lwb_blk);
}
- ASSERT3U(bp->blk_birth, ==, txg);
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_zio, wsz);
- /*
- * Allocate a new log write buffer (lwb).
- */
- nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ } else {
+ wsz = lwb->lwb_sz;
+ }
- nlwb->lwb_zilog = zilog;
- nlwb->lwb_blk = *bp;
- nlwb->lwb_nused = 0;
- nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
- nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
- nlwb->lwb_max_txg = txg;
- nlwb->lwb_zio = NULL;
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
/*
- * Put new lwb at the end of the log chain
+ * clear unused data for security
*/
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, nlwb);
- mutex_exit(&zilog->zl_lock);
+ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
- /* Record the block for later vdev flushing */
- zil_add_block(zilog, &lwb->lwb_blk);
+ zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
/*
- * kick off the write for the old log block
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
*/
- dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
- ASSERT(lwb->lwb_zio);
- zio_nowait(lwb->lwb_zio);
-
return (nlwb);
}
@@ -870,20 +987,20 @@ static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr = (lr_write_t *)lrc;
+ lr_write_t *lrw = (lr_write_t *)lrc;
+ char *lr_buf;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen;
+ uint64_t dlen = 0;
if (lwb == NULL)
return (NULL);
+
ASSERT(lwb->lwb_buf != NULL);
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
dlen = P2ROUNDUP_TYPED(
- lr->lr_length, sizeof (uint64_t), uint64_t);
- else
- dlen = 0;
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
zilog->zl_cur_used += (reclen + dlen);
@@ -892,24 +1009,22 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
/*
* If this record won't fit in the current log block, start a new one.
*/
- if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_init(zilog, lwb);
- ASSERT(lwb->lwb_nused == 0);
- if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ ASSERT(LWB_EMPTY(lwb));
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
}
}
- /*
- * Update the lrc_seq, to be log record sequence number. See zil.h
- * Then copy the record to the log buffer.
- */
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrc = (lr_t *)lr_buf;
+ lrw = (lr_write_t *)lrc;
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
@@ -921,18 +1036,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
char *dbuf;
int error;
- /* alignment is guaranteed */
- lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
if (dlen) {
ASSERT(itx->itx_wr_state == WR_NEED_COPY);
- dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
- lr->lr_common.lrc_reclen += dlen;
+ dbuf = lr_buf + reclen;
+ lrw->lr_common.lrc_reclen += dlen;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
error = zilog->zl_get_data(
- itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ itx->itx_private, lrw, dbuf, lwb->lwb_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
@@ -945,9 +1058,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
}
}
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
lwb->lwb_nused += reclen + dlen;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
- ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
return (lwb);
@@ -965,247 +1085,446 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
itx->itx_lr.lrc_reclen = lrsize;
itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
return (itx);
}
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+void
+zil_itx_destroy(itx_t *itx)
{
- uint64_t seq;
+ kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
- ASSERT(itx->itx_lr.lrc_seq == 0);
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
+{
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz += itx->itx_sod;
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
- itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
+
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
+
+ if (o1 < o2)
+ return (-1);
+ if (o1 > o2)
+ return (1);
- return (seq);
+ return (0);
}
/*
- * Free up all in-memory intent log transactions that have now been synced.
+ * Remove all async itx with the given oid.
*/
static void
-zil_itx_clean(zilog_t *zilog)
+zil_remove_async(zilog_t *zilog, uint64_t oid)
{
- uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
- uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
list_t clean_list;
itx_t *itx;
+ ASSERT(oid != 0);
list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
- mutex_enter(&zilog->zl_lock);
- /* wait for a log writer to finish walking list */
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- }
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
- /*
- * Move the sync'd log transactions to a separate list so we can call
- * kmem_free without holding the zl_lock.
- *
- * There is no need to set zl_writer as we don't drop zl_lock here
- */
- while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
- itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
- list_insert_tail(&clean_list, itx);
- }
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
- /* destroy sync'd log transactions */
+ /*
+ * Locate the object node and append its list.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
while ((itx = list_head(&clean_list)) != NULL) {
list_remove(&clean_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
}
list_destroy(&clean_list);
}
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if a fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+ zil_remove_async(zilog, itx->itx_oid);
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean = itxg->itxg_itxs;
+ }
+ ASSERT(itxg->itxg_sod == 0);
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
+ itxg->itxg_sod += itx->itx_sod;
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
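A detail worth calling out in zil_itx_assign() above: when the per-txg slot still holds a stale itxs_t, it is only detached while the itxg lock is held, and the actual teardown happens after the lock has been dropped. The fragment below is a minimal pthreads sketch of that detach-then-free pattern; all names are hypothetical.

#include <pthread.h>
#include <stdlib.h>

struct slot {
	pthread_mutex_t	lock;
	void		*payload;	/* e.g. an itxs_t in the real code */
};

static void
destroy_payload(void *p)
{
	free(p);			/* potentially slow teardown */
}

static void
install_fresh(struct slot *s, void *fresh)
{
	void *stale;

	pthread_mutex_lock(&s->lock);
	stale = s->payload;		/* detach under the lock */
	s->payload = fresh;
	pthread_mutex_unlock(&s->lock);

	if (stale != NULL)
		destroy_payload(stale);	/* free outside the lock */
}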
+
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
-zil_clean(zilog_t *zilog)
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
- itx_t *itx;
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
- mutex_enter(&zilog->zl_lock);
- itx = list_head(&zilog->zl_itx_list);
- if ((itx != NULL) &&
- (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
- (void) taskq_dispatch(zilog->zl_clean_taskq,
- (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT(itxg->itxg_txg != 0);
+ ASSERT(zilog->zl_clean_taskq != NULL);
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+ * Preferably start a task queue to free up the old itxs but
+ * if taskq_dispatch can't allocate resources to do that then
+ * free it in-line. This should be rare. Note, using TQ_SLEEP
+ * created a bad performance problem.
+ */
+ if (taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+ zil_itxg_clean(clean_me);
+}
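zil_clean() above prefers to push the free onto a task queue and only falls back to freeing inline when the dispatch cannot allocate resources. Below is a tiny sketch of that dispatch-or-do-it-inline pattern, with try_dispatch() as a hypothetical stand-in for taskq_dispatch(..., TQ_NOSLEEP).

#include <stdio.h>

typedef void task_fn_t(void *);

/*
 * Hypothetical stand-in for taskq_dispatch(tq, fn, arg, TQ_NOSLEEP):
 * returns nonzero if the work was queued, 0 if resources were unavailable.
 * Here it always fails, to exercise the inline fallback.
 */
static int
try_dispatch(task_fn_t *fn, void *arg)
{
	(void)fn; (void)arg;
	return (0);
}

static void
clean_work(void *arg)
{
	printf("cleaning %s\n", (const char *)arg);
}

int
main(void)
{
	/* Prefer the background path; fall back to doing the work inline. */
	if (try_dispatch(clean_work, "itxs") == 0)
		clean_work("itxs");
	return (0);
}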
+
+/*
+ * Get the list of itxs to commit into zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+ uint64_t push_sod = 0;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+ push_sod += itxg->itxg_sod;
+ itxg->itxg_sod = 0;
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+ atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+static void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We add to the end rather than the
+ * beginning to ensure the create has happened.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
}
- mutex_exit(&zilog->zl_lock);
}
static void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit_writer(zilog_t *zilog)
{
uint64_t txg;
- uint64_t commit_seq = 0;
- itx_t *itx, *itx_next = (itx_t *)-1;
+ itx_t *itx;
lwb_t *lwb;
- spa_t *spa;
+ spa_t *spa = zilog->zl_spa;
+ int error = 0;
- zilog->zl_writer = B_TRUE;
ASSERT(zilog->zl_root_zio == NULL);
- spa = zilog->zl_spa;
+
+ mutex_exit(&zilog->zl_lock);
+
+ zil_get_commit_list(zilog);
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL) {
+ mutex_enter(&zilog->zl_lock);
+ return;
+ }
if (zilog->zl_suspend) {
lwb = NULL;
} else {
lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- /*
- * Return if there's nothing to flush before we
- * dirty the fs by calling zil_create()
- */
- if (list_is_empty(&zilog->zl_itx_list)) {
- zilog->zl_writer = B_FALSE;
- return;
- }
- mutex_exit(&zilog->zl_lock);
- zil_create(zilog);
- mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
- }
+ if (lwb == NULL)
+ lwb = zil_create(zilog);
}
- /* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- for (;;) {
- /*
- * Find the next itx to push:
- * Push all transactions related to specified foid and all
- * other transactions except TX_WRITE, TX_TRUNCATE,
- * TX_SETATTR and TX_ACL for all other files.
- */
- if (itx_next != (itx_t *)-1)
- itx = itx_next;
- else
- itx = list_head(&zilog->zl_itx_list);
- for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
- if (foid == 0) /* push all foids? */
- break;
- if (itx->itx_sync) /* push all O_[D]SYNC */
- break;
- switch (itx->itx_lr.lrc_txtype) {
- case TX_SETATTR:
- case TX_WRITE:
- case TX_TRUNCATE:
- case TX_ACL:
- /* lr_foid is same offset for these records */
- if (((lr_write_t *)&itx->itx_lr)->lr_foid
- != foid) {
- continue; /* skip this record */
- }
- }
- break;
- }
- if (itx == NULL)
- break;
-
- if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
- break;
- }
-
- /*
- * Save the next pointer. Even though we soon drop
- * zl_lock all threads that may change the list
- * (another writer or zil_itx_clean) can't do so until
- * they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
- mutex_exit(&zilog->zl_lock);
+ while (itx = list_head(&zilog->zl_itx_commit_list)) {
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
- if (txg > spa_last_synced_txg(spa) ||
- txg > spa_freeze_txg(spa))
+ if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
+ list_remove(&zilog->zl_itx_commit_list, itx);
kmem_free(itx, offsetof(itx_t, itx_lr)
+ itx->itx_lr.lrc_reclen);
- mutex_enter(&zilog->zl_lock);
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
- /* determine commit sequence number */
- itx = list_head(&zilog->zl_itx_list);
- if (itx)
- commit_seq = itx->itx_lr.lrc_seq;
- else
- commit_seq = zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
/* write the last block out */
if (lwb != NULL && lwb->lwb_zio != NULL)
lwb = zil_lwb_write_start(zilog, lwb);
- zilog->zl_prev_used = zilog->zl_cur_used;
zilog->zl_cur_used = 0;
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
if (zilog->zl_root_zio) {
- DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
- (void) zio_wait(zilog->zl_root_zio);
+ error = zio_wait(zilog->zl_root_zio);
zilog->zl_root_zio = NULL;
- DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
zil_flush_vdevs(zilog);
}
- if (zilog->zl_log_error || lwb == NULL) {
- zilog->zl_log_error = 0;
+ if (error || lwb == NULL)
txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
mutex_enter(&zilog->zl_lock);
- zilog->zl_writer = B_FALSE;
- ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
- zilog->zl_commit_seq = commit_seq;
+ /*
+ * Remember the highest committed log sequence number for ztest.
+ * We only update this value when all the log writes succeeded,
+ * because ztest wants to ASSERT that it got the whole log chain.
+ */
+ if (error == 0 && lwb != NULL)
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
/*
- * Push zfs transactions to stable storage up to the supplied sequence number.
+ * Commit zfs transactions to stable storage.
* If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
+ * for that object or might reference that object.
+ *
+ * itxs are committed in batches. In a heavily stressed zil there will be
+ * a commit writer thread who is writing out a bunch of itxs to the log
+ * for a set of committing threads (cthreads) in the same batch as the writer.
+ * Those cthreads are all waiting on the same cv for that batch.
+ *
+ * There will also be a different and growing batch of threads that are
+ * waiting to commit (qthreads). When the committing batch completes
+ * a transition occurs such that the cthreads exit and the qthreads become
+ * cthreads. One of the new cthreads becomes the writer thread for the
+ * batch. Any new threads arriving become new qthreads.
+ *
+ * Only two condition variables are needed, and no transition between
+ * the two cvs is required. They just flip-flop between qthreads
+ * and cthreads.
+ *
+ * Using this scheme we can efficiently wake up only those threads
+ * whose batch has been committed.
*/
void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit(zilog_t *zilog, uint64_t foid)
{
- if (zilog == NULL || seq == 0)
- return;
+ uint64_t mybatch;
- mutex_enter(&zilog->zl_lock);
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
- seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+ /* move the async itxs for the foid to the sync queues */
+ zil_async_to_sync(zilog, foid);
+ mutex_enter(&zilog->zl_lock);
+ mybatch = zilog->zl_next_batch;
while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq < zilog->zl_commit_seq) {
+ cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
+ if (mybatch <= zilog->zl_com_batch) {
mutex_exit(&zilog->zl_lock);
return;
}
}
- zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
- /* wake up others waiting on the commit */
- cv_broadcast(&zilog->zl_cv_writer);
+
+ zilog->zl_next_batch++;
+ zilog->zl_writer = B_TRUE;
+ zil_commit_writer(zilog);
+ zilog->zl_com_batch = mybatch;
+ zilog->zl_writer = B_FALSE;
mutex_exit(&zilog->zl_lock);
+
+ /* wake up one thread to become the next writer */
+ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
+
+ /* wake up all threads waiting for this batch to be committed */
+ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
}
/*
@@ -1217,6 +1536,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
zil_header_t *zh = zil_header_in_syncing_context(zilog);
uint64_t txg = dmu_tx_get_txg(tx);
spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
lwb_t *lwb;
/*
@@ -1230,7 +1550,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
ASSERT(zilog->zl_stop_sync == 0);
- zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
@@ -1259,7 +1583,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
- zio_free_blk(spa, &lwb->lwb_blk, txg);
+ zio_free_zil(spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
/*
@@ -1287,6 +1611,18 @@ zil_fini(void)
kmem_cache_destroy(zil_lwb_cache);
}
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+ zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+ zilog->zl_logbias = logbias;
+}
+
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
@@ -1299,15 +1635,23 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_spa = dmu_objset_spa(os);
zilog->zl_dmu_pool = dmu_objset_pool(os);
zilog->zl_destroy_txg = TXG_INITIAL - 1;
+ zilog->zl_logbias = dmu_objset_logbias(os);
+ zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_next_batch = 1;
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zilog->zl_itx_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
offsetof(lwb_t, lwb_node));
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
@@ -1315,6 +1659,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
return (zilog);
}
@@ -1322,27 +1668,47 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
void
zil_free(zilog_t *zilog)
{
- lwb_t *lwb;
+ lwb_t *head_lwb;
zilog->zl_stop_sync = 1;
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- kmem_cache_free(zil_lwb_cache, lwb);
+ /*
+ * After zil_close() there should only be one lwb with a buffer.
+ */
+ head_lwb = list_head(&zilog->zl_lwb_list);
+ if (head_lwb) {
+ ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
+ list_remove(&zilog->zl_lwb_list, head_lwb);
+ zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
+ kmem_cache_free(zil_lwb_cache, head_lwb);
}
list_destroy(&zilog->zl_lwb_list);
avl_destroy(&zilog->zl_vdev_tree);
mutex_destroy(&zilog->zl_vdev_lock);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
- list_destroy(&zilog->zl_itx_list);
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+ * callback to remove the entry. We remove those here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
mutex_destroy(&zilog->zl_lock);
cv_destroy(&zilog->zl_cv_writer);
cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_cv_batch[0]);
+ cv_destroy(&zilog->zl_cv_batch[1]);
kmem_free(zilog, sizeof (zilog_t));
}
@@ -1368,26 +1734,28 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
void
zil_close(zilog_t *zilog)
{
+ lwb_t *tail_lwb;
+ uint64_t txg = 0;
+
+ zil_commit(zilog, 0); /* commit all itx */
+
/*
- * If the log isn't already committed, mark the objset dirty
- * (so zil_sync() will be called) and wait for that txg to sync.
+ * The lwb_max_txg for the stubby lwb will reflect the last activity
+ * for the zil. After a txg_wait_synced() on the txg we know all the
+ * callbacks have occurred that may clean the zil. Only then can we
+ * destroy the zl_clean_taskq.
*/
- if (!zil_is_committed(zilog)) {
- uint64_t txg;
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
+ mutex_enter(&zilog->zl_lock);
+ tail_lwb = list_tail(&zilog->zl_lwb_list);
+ if (tail_lwb != NULL)
+ txg = tail_lwb->lwb_max_txg;
+ mutex_exit(&zilog->zl_lock);
+ if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
zilog->zl_get_data = NULL;
-
- zil_itx_clean(zilog);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
/*
@@ -1419,15 +1787,7 @@ zil_suspend(zilog_t *zilog)
zilog->zl_suspending = B_TRUE;
mutex_exit(&zilog->zl_lock);
- zil_commit(zilog, UINT64_MAX, 0);
-
- /*
- * Wait for any in-flight log writes to complete.
- */
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
+ zil_commit(zilog, 0);
zil_destroy(zilog, B_FALSE);
@@ -1448,102 +1808,89 @@ zil_resume(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
}
-/*
- * Read in the data for the dmu_sync()ed block, and change the log
- * record to write this whole block.
- */
-void
-zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
-{
- blkptr_t *wbp = &lr->lr_blkptr;
- char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t blksz;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- /*
- * If the blksz is zero then we must be replaying a log
- * from an version prior to setting the blksize of null blocks.
- * So we just zero the actual write size reqeusted.
- */
- if (blksz == 0) {
- bzero(wbuf, lr->lr_length);
- return;
- }
- bzero(wbuf, blksz);
- } else {
- /*
- * A subsequent write may have overwritten this block, in which
- * case wbp may have been been freed and reallocated, and our
- * read of wbp may fail with a checksum error. We can safely
- * ignore this because the later write will provide the
- * correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lr->lr_foid;
- zb.zb_level = 0;
- zb.zb_blkid = -1; /* unknown */
-
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
- NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- }
- lr->lr_offset -= lr->lr_offset % blksz;
- lr->lr_length = blksz;
-}
-
typedef struct zil_replay_arg {
- objset_t *zr_os;
zil_replay_func_t **zr_replay;
void *zr_arg;
boolean_t zr_byteswap;
- char *zr_lrbuf;
+ char *zr_lr;
} zil_replay_arg_t;
-static void
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+ char name[MAXNAMELEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
zil_replay_arg_t *zr = zra;
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
- char *name;
- int pass, error;
+ int error = 0;
- if (!zilog->zl_replay) /* giving up */
- return;
-
- if (lr->lrc_txg < claim_txg) /* already committed */
- return;
+ zilog->zl_replaying_seq = lr->lrc_seq;
if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return;
+ return (0);
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return (0);
/* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI;
- if (txtype == 0 || txtype >= TX_MAX_TYPE) {
- error = EINVAL;
- goto bad;
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ ((lr_ooo_t *)lr)->lr_foid, NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
}
/*
* Make a copy of the data so we can revise and extend it.
*/
- bcopy(lr, zr->zr_lrbuf, reclen);
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
+ }
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different data types, and only the
+ * However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lrbuf, reclen);
+ byteswap_uint64_array(zr->zr_lr, reclen);
/*
* We must now do two things atomically: replay this log record,
@@ -1551,42 +1898,30 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/
- for (pass = 1; pass <= 2; pass++) {
- zilog->zl_replaying_seq = lr->lrc_seq;
- /* Only byteswap (if needed) on the 1st pass. */
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
- zr->zr_byteswap && pass == 1);
-
- if (!error)
- return;
-
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error) {
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
- * any removes then retry the transaction.
+ * any removes then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
*/
- if (pass == 1)
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
}
-
-bad:
- ASSERT(error);
- name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu %s\n",
- error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
- (lr->lrc_txtype & TX_CI) ? "CI" : "");
- zilog->zl_replay = B_FALSE;
- kmem_free(name, MAXNAMELEN);
+ return (0);
}
/* ARGSUSED */
-static void
+static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zilog->zl_replay_blks++;
+
+ return (0);
}
/*
@@ -1605,11 +1940,10 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
}
//printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
- zr.zr_os = os;
zr.zr_replay = replay_func;
zr.zr_arg = arg;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+ zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
/*
* Wait for in-progress removes to sync before starting replay.
@@ -1617,11 +1951,11 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
txg_wait_synced(zilog->zl_dmu_pool, 0);
zilog->zl_replay = B_TRUE;
- zilog->zl_replay_time = LBOLT;
+ zilog->zl_replay_time = ddi_get_lbolt();
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
- kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+ kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
@@ -1629,58 +1963,31 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
//printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
}
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
- lwb_t *lwb;
- int ret;
-
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
- /* recent unpushed intent log transactions? */
- if (!list_is_empty(&zilog->zl_itx_list)) {
- ret = B_FALSE;
- goto out;
- }
-
- /* intent log never used? */
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- ret = B_TRUE;
- goto out;
- }
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return (B_TRUE);
- /*
- * more than 1 log buffer means zil_sync() hasn't yet freed
- * entries after a txg has committed
- */
- if (list_next(&zilog->zl_lwb_list, lwb)) {
- ret = B_FALSE;
- goto out;
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
}
- ASSERT(zil_empty(zilog));
- ret = B_TRUE;
-out:
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
- return (ret);
+ return (B_FALSE);
}
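
zil_replaying() gives the filesystem-level logging code one predicate for "do not generate a new itx for this operation": it answers B_TRUE when synchronous semantics are disabled for the dataset, and also while replay is in progress, in which case it additionally dirties the dataset and records the sequence number being replayed against the open txg. A hypothetical caller would use it roughly as below; zfs_log_foo is an illustrative name, not a function in this patch.

void
zfs_log_foo(zilog_t *zilog, dmu_tx_t *tx /* , op-specific args */)
{
	/*
	 * Skip logging when the ZIL is disabled for this dataset or when
	 * the operation is itself coming out of the log during replay.
	 */
	if (zilog == NULL || zil_replaying(zilog, tx))
		return;

	/* ... build the itx and assign it to the ZIL as usual ... */
}
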
/* ARGSUSED */
int
-zil_vdev_offline(char *osname, void *arg)
+zil_vdev_offline(const char *osname, void *arg)
{
objset_t *os;
zilog_t *zilog;
int error;
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ error = dmu_objset_hold(osname, FTAG, &os);
if (error)
return (error);
@@ -1689,6 +1996,6 @@ zil_vdev_offline(char *osname, void *arg)
error = EEXIST;
else
zil_resume(zilog);
- dmu_objset_close(os);
+ dmu_objset_rele(os, FTAG);
return (error);
}
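
The open/close pair becomes the hold/rele interface this patch moves callers to. The holder passes an opaque tag (FTAG conventionally expands to the calling function's name) so leaked holds can be attributed, and the same tag must be passed on release. A minimal sketch of the calling convention, inferred from this hunk:

objset_t *os;
int error;

error = dmu_objset_hold(osname, FTAG, &os);	/* take a tagged hold */
if (error)
	return (error);

/* ... inspect the objset; the hold keeps it from disappearing ... */

dmu_objset_rele(os, FTAG);			/* release with the same tag */
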
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 32f15e90d4fa..5e968b5c29b9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -32,6 +31,9 @@
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -57,6 +59,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
6, /* ZIO_PRIORITY_ASYNC_READ */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
+ 2, /* ZIO_PRIORITY_DDT_PREFETCH */
};
/*
@@ -69,10 +72,6 @@ char *zio_type_name[ZIO_TYPES] = {
"zio_ioctl"
};
-#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
-#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
-#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
-
/*
* ==========================================================================
* I/O kmem caches
@@ -91,8 +90,15 @@ extern vmem_t *zio_alloc_arena;
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
-#define IO_IS_ALLOCATING(zio) \
- ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
void
zio_init(void)
@@ -113,6 +119,7 @@ zio_init(void)
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
size_t p2 = size;
size_t align = 0;
+ size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
while (p2 & (p2 - 1))
p2 &= p2 - 1;
@@ -129,11 +136,17 @@ zio_init(void)
char name[36];
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+ /*
+ * Since zio_data bufs do not appear in crash dumps, we
+ * pass KMC_NOTOUCH so that no allocator metadata is
+ * stored with the buffers.
+ */
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL,
+ cflags | KMC_NOTOUCH);
}
}
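
Two details in the zio_init() hunk are worth spelling out. The existing p2 loop clears the lowest set bit until a power of two remains, i.e. it computes the largest power of two not exceeding the buffer size, which the (not fully shown) alignment logic then uses; and KMC_NODEBUG is now applied only to buffers larger than zio_buf_debug_limit, so small buffers keep kmem debugging in ZFS_DEBUG builds. A self-contained illustration of both; the constants here mirror the ones above but are local to the sketch:

#include <stdio.h>
#include <stddef.h>

#define SPA_MINBLOCKSHIFT	9	/* 512-byte allocation quantum, as above */
#define KMC_NODEBUG		0x1	/* illustrative flag value only */
#define BUF_DEBUG_LIMIT		16384	/* the ZFS_DEBUG default of zio_buf_debug_limit */

int
main(void)
{
	size_t idxs[] = { 0, 2, 5, 13, 31, 63, 127, 255 };

	for (size_t i = 0; i < sizeof (idxs) / sizeof (idxs[0]); i++) {
		size_t size = (idxs[i] + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t cflags = (size > BUF_DEBUG_LIMIT) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))	/* drop the lowest set bit until a power of 2 remains */
			p2 &= p2 - 1;

		printf("size %7zu  largest pow2 <= size %7zu  cflags %#zx\n",
		    size, p2, cflags);
	}
	return (0);
}
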
@@ -280,7 +293,8 @@ zio_pop_transforms(zio_t *zio)
zt->zt_transform(zio,
zt->zt_orig_data, zt->zt_orig_size);
- zio_buf_free(zio->io_data, zt->zt_bufsize);
+ if (zt->zt_bufsize != 0)
+ zio_buf_free(zio->io_data, zt->zt_bufsize);
zio->io_data = zt->zt_orig_data;
zio->io_size = zt->zt_orig_size;
@@ -309,7 +323,7 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
{
if (zio->io_error == 0 &&
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_data, zio->io_size, data, size) != 0)
+ zio->io_data, data, zio->io_size, size) != 0)
zio->io_error = EIO;
}
@@ -394,6 +408,9 @@ zio_add_child(zio_t *pio, zio_t *cio)
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
}
@@ -410,6 +427,9 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
@@ -425,7 +445,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
mutex_enter(&zio->io_lock);
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
- zio->io_stage--;
+ zio->io_stage >>= 1;
zio->io_stall = countp;
waiting = B_TRUE;
}
@@ -467,10 +487,11 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
* ==========================================================================
*/
static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
- const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
+ zio_type_t type, int priority, enum zio_flag flags,
+ vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+ enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
@@ -497,14 +518,17 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
else
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
- zio->io_bp = bp;
+ zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- if (type != ZIO_TYPE_WRITE)
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
@@ -514,14 +538,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_spa = spa;
zio->io_txg = txg;
- zio->io_data = data;
- zio->io_size = size;
zio->io_done = done;
zio->io_private = private;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
+ zio->io_orig_data = zio->io_data = data;
+ zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -555,7 +579,7 @@ zio_destroy(zio_t *zio)
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
- void *private, int flags)
+ void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -567,7 +591,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
}
zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
return (zio_null(NULL, spa, NULL, done, private, flags));
}
@@ -575,23 +599,24 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
- zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
data, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
return (zio);
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
@@ -601,13 +626,15 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
zp->zp_type < DMU_OT_NUMTYPES &&
zp->zp_level < 32 &&
- zp->zp_ndvas > 0 &&
- zp->zp_ndvas <= spa_max_replication(spa));
- ASSERT(ready != NULL);
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa) &&
+ zp->zp_dedup <= 1 &&
+ zp->zp_dedup_verify <= 1);
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
zio->io_ready = ready;
zio->io_prop = *zp;
@@ -618,7 +645,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ enum zio_flag flags, zbookmark_t *zb)
{
zio_t *zio;
@@ -629,33 +656,47 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
return (zio);
}
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+}
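
zio_free() no longer issues any I/O: it appends the block pointer to a per-txg bplist, presumably so the frees can be issued in bulk when that txg syncs, while the immediate path becomes the separate zio_free_sync() below. The array is indexed with txg & TXG_MASK, the usual ZFS idiom for a small ring of in-flight transaction groups. A stand-alone sketch of that indexing; TXG_SIZE of 4 is an assumption of the sketch, mirroring the customary value:

#include <stdio.h>
#include <stdint.h>

#define TXG_SIZE	4		/* assumed number of in-flight txgs */
#define TXG_MASK	(TXG_SIZE - 1)

int
main(void)
{
	/*
	 * Each in-flight txg owns one slot; a slot is reused only after the
	 * txg that last used it has fully synced, so the masked index never
	 * collides with live data.
	 */
	for (uint64_t txg = 100; txg < 108; txg++)
		printf("txg %llu -> spa_free_bplist[%llu]\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & TXG_MASK));
	return (0);
}
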
+
zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ enum zio_flag flags)
{
zio_t *zio;
- ASSERT(!BP_IS_HOLE(bp));
-
- if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
+ dprintf_bp(bp, "freeing in txg %llu, pass %u",
+ (longlong_t)txg, spa->spa_sync_pass);
- if (txg == spa->spa_syncing_txg &&
- spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
- }
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(spa_syncing_txg(spa) == txg);
+ ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
return (zio);
}
zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -669,9 +710,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT3U(spa_first_txg(spa), <=, txg);
+ ASSERT(txg == spa_first_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
@@ -682,7 +725,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
zio_t *zio;
int c;
@@ -707,7 +750,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -728,7 +771,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -743,9 +786,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio->io_prop.zp_checksum = checksum;
- if (zio_checksum_table[checksum].ci_zbt) {
+ if (zio_checksum_table[checksum].ci_eck) {
/*
- * zbt checksums are necessarily destructive -- they modify
+ * zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
@@ -763,10 +806,10 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, int flags,
+ void *data, uint64_t size, int type, int priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
- uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
ASSERT(vd->vdev_parent ==
@@ -779,26 +822,33 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
- pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
- pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
if (vd->vdev_children == 0)
offset += VDEV_LABEL_START_SIZE;
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
- done, private, type, priority,
- (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
- vd, offset, &pio->io_bookmark,
- ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- int type, int priority, int flags, zio_done_func_t *done, void *private)
+ int type, int priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -808,7 +858,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
data, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
vd, offset, NULL,
- ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
return (zio);
}
@@ -821,6 +871,23 @@ zio_flush(zio_t *zio, vdev_t *vd)
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT(zio->io_executor == NULL);
+ ASSERT(zio->io_orig_size == zio->io_size);
+ ASSERT(size <= zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp))
+ zio->io_orig_size = zio->io_size = size;
+}
+
/*
* ==========================================================================
* Prepare to read and write logical blocks
@@ -835,28 +902,33 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(psize);
- zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
+ zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
- int compress = zp->zp_compress;
+ enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
- void *cbuf;
uint64_t lsize = zio->io_size;
- uint64_t csize = lsize;
- uint64_t cbufsize = 0;
+ uint64_t psize = lsize;
int pass = 1;
/*
@@ -870,7 +942,29 @@ zio_write_bp_init(zio_t *zio)
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
- ASSERT(compress != ZIO_COMPRESS_INHERIT);
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+ zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ }
if (bp->blk_birth == zio->io_txg) {
/*
@@ -882,22 +976,29 @@ zio_write_bp_init(zio_t *zio)
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
- pass = spa_sync_pass(zio->io_spa);
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
if (pass > SYNC_PASS_DONT_COMPRESS)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
- spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
+ ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
if (compress != ZIO_COMPRESS_OFF) {
- if (!zio_compress_data(compress, zio->io_data, zio->io_size,
- &cbuf, &csize, &cbufsize)) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
- } else if (csize != 0) {
- zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
+ zio_buf_free(cbuf, lsize);
+ } else {
+ ASSERT(psize < lsize);
+ zio_push_transform(zio, cbuf, psize, lsize, NULL);
}
}
@@ -909,10 +1010,10 @@ zio_write_bp_init(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
pass > SYNC_PASS_REWRITE) {
- ASSERT(csize != 0);
- uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+ ASSERT(psize != 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -920,17 +1021,36 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline = ZIO_WRITE_PIPELINE;
}
- if (csize == 0) {
+ if (psize == 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, csize);
+ BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
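
The compression path now calls zio_compress_data() with a caller-supplied buffer and gets back the physical size, so the decision about what ends up on disk collapses to three cases: psize == 0 (the block compressed away entirely, e.g. all zeros) turns the write into a hole, psize == lsize means compression did not help and the data is stored raw, and 0 < psize < lsize stores the compressed copy with lsize recorded in the bp. A small stand-alone decision function capturing just that logic (names are illustrative, not part of the patch):

#include <stdint.h>

enum write_disposition {
	WRITE_AS_HOLE,		/* psize == 0: nothing needs to be stored       */
	WRITE_UNCOMPRESSED,	/* psize == lsize: compression gained nothing   */
	WRITE_COMPRESSED	/* 0 < psize < lsize: store psize, record lsize */
};

static enum write_disposition
classify_write(uint64_t lsize, uint64_t psize)
{
	if (psize == 0)
		return (WRITE_AS_HOLE);
	if (psize >= lsize)
		return (WRITE_UNCOMPRESSED);
	return (WRITE_COMPRESSED);
}
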
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
}
return (ZIO_PIPELINE_CONTINUE);
@@ -943,10 +1063,11 @@ zio_write_bp_init(zio_t *zio)
*/
static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
+ int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
#ifdef _KERNEL
struct ostask *task;
#endif
@@ -984,10 +1105,10 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, task);
+ (task_func_t *)zio_execute, zio, flags, task);
#else
(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, TQ_SLEEP);
+ (task_func_t *)zio_execute, zio, flags);
#endif
}
@@ -1007,7 +1128,7 @@ zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
static int
zio_issue_async(zio_t *zio)
{
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
@@ -1015,7 +1136,7 @@ zio_issue_async(zio_t *zio)
void
zio_interrupt(zio_t *zio)
{
- zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
/*
@@ -1031,7 +1152,7 @@ zio_interrupt(zio_t *zio)
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
+static zio_pipe_stage_t *zio_pipeline[];
void
zio_execute(zio_t *zio)
@@ -1039,32 +1160,39 @@ zio_execute(zio_t *zio)
zio->io_executor = curthread;
while (zio->io_stage < ZIO_STAGE_DONE) {
- uint32_t pipeline = zio->io_pipeline;
- zio_stage_t stage = zio->io_stage;
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
- while (((1U << ++stage) & pipeline) == 0)
- continue;
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
ASSERT(stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stall == NULL);
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
- * issue async to avoid deadlock.
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
+ *
+ * For VDEV_IO_START, we cut in line so that the io will
+ * be sent to disk promptly.
*/
- if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
- zio->io_vd == NULL &&
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
return;
}
zio->io_stage = stage;
- rv = zio_pipeline[stage](zio);
+ rv = zio_pipeline[highbit(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
return;
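
The stage bookkeeping throughout this file changes meaning: io_stage used to be a small integer index, so "previous stage" was io_stage-- and dispatch was zio_pipeline[stage]; now each stage is a one-hot bit, so backing up is >> 1, advancing is << 1 until a bit also set in the pipeline mask is found, and the function table is indexed by bit position via highbit(stage) - 1. A self-contained illustration with made-up stage values (the real enum zio_stage lives in the zio implementation headers and is not reproduced here):

#include <stdio.h>
#include <stdint.h>

/* Illustrative one-hot stages -- the values are assumptions of this sketch. */
enum stage {
	STAGE_OPEN	= 1 << 0,
	STAGE_READ_BP	= 1 << 1,
	STAGE_ISSUE	= 1 << 2,
	STAGE_VDEV_START = 1 << 3,
	STAGE_DONE	= 1 << 4
};

static int
highbit_local(uint64_t v)	/* 1-based index of the highest set bit */
{
	int h = 0;

	while (v) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint32_t pipeline = STAGE_OPEN | STAGE_ISSUE | STAGE_VDEV_START | STAGE_DONE;
	uint32_t stage = STAGE_OPEN;

	while (stage < STAGE_DONE) {
		do {
			stage <<= 1;			/* advance ...            */
		} while ((stage & pipeline) == 0);	/* ... to an enabled stage */
		printf("run zio_pipeline[%d] (stage bit %#x)\n",
		    highbit_local(stage) - 1, stage);
	}
	/* Backing up one stage, e.g. for a retry, is simply: stage >>= 1; */
	return (0);
}
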
@@ -1147,19 +1275,8 @@ zio_reexecute(zio_t *pio)
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
- if (IO_IS_ALLOCATING(pio)) {
- /*
- * Remember the failed bp so that the io_ready() callback
- * can update its accounting upon reexecution. The block
- * was already freed in zio_done(); we indicate this with
- * a fill count of -1 so that zio_free() knows to skip it.
- */
- blkptr_t *bp = pio->io_bp;
- ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
- bp->blk_fill = BLK_FILL_ALREADY_FREED;
- pio->io_bp_orig = *bp;
- BP_ZERO(bp);
- }
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
/*
* As we reexecute pio's children, new children could be created.
@@ -1347,6 +1464,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp));
}
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
@@ -1360,8 +1483,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
- return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
- NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+ return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
@@ -1445,7 +1568,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
@@ -1455,7 +1578,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
ASSERT(zio->io_data == gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1482,7 +1605,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1551,9 +1674,9 @@ zio_write_gang_member_ready(zio_t *zio)
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
- ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
@@ -1578,13 +1701,13 @@ zio_write_gang_block(zio_t *pio)
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
- int ndvas = gio->io_prop.zp_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
int error;
- error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+ error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
if (error) {
pio->io_error = error;
@@ -1620,7 +1743,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
- zp.zp_ndvas = gio->io_prop.zp_ndvas;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = 0;
+ zp.zp_dedup_verify = 0;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1641,15 +1766,383 @@ zio_write_gang_block(zio_t *pio)
/*
* ==========================================================================
- * Allocate and free blocks
+ * Dedup
* ==========================================================================
*/
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+ if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+ dde->dde_repair_data = zio->io_data;
+ else
+ zio_buf_free(zio->io_data, zio->io_size);
+ mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio_ddt_child_read_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+ &zio->io_bookmark));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
static int
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ return (ZIO_PIPELINE_STOP);
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ if (dde == NULL) {
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (ZIO_PIPELINE_STOP);
+ }
+ if (dde->dde_repair_data != NULL) {
+ bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ */
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ bcmp(zio->io_orig_data, lio->io_orig_data,
+ zio->io_orig_size) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ uint32_t aflags = ARC_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ ddt_exit(ddt);
+
+ error = arc_read_nolock(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (arc_buf_size(abuf) != zio->io_orig_size ||
+ bcmp(abuf->b_data, zio->io_orig_data,
+ zio->io_orig_size) != 0)
+ error = EEXIST;
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
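
zio_ddt_collision() is what dedup=verify means in practice: a checksum match is only trusted after a byte-for-byte comparison, either against the in-flight lead zio for the entry or against a copy of the existing block read back through the ARC. As the comment notes, it compares io_orig_data/io_orig_size, the data before compression and other transforms, so an override bp from dmu_sync() can be verified without re-running the transform stack. The comparison itself is nothing more than (trivial sketch, user-level names):

#include <string.h>
#include <stdint.h>

/* Nonzero means the candidate cannot be the same block: treat as a collision. */
static int
dedup_collision(const void *a, uint64_t asize, const void *b, uint64_t bsize)
{
	return (asize != bsize || memcmp(a, b, asize) != 0);
}
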
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ while ((pio = zio_walk_parents(zio)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ while (zio_walk_parents(zio) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+ int p = DDT_PHYS_DITTO;
+ zio_prop_t *zp = &zio->io_prop;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_key_t *ddk = &dde->dde_key;
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ if (ddp->ddp_phys_birth != 0)
+ ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ ddt_phys_fill(ddp, bp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ int ditto_copies;
+ zio_t *cio = NULL;
+ zio_t *dio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = 0;
+ }
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+ if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ zio_prop_t czp = *zp;
+
+ czp.zp_copies = ditto_copies;
+
+ /*
+ * If we arrived here with an override bp, we won't have run
+ * the transform stack, so we won't have the data we need to
+ * generate a child i/o. So, toss the override bp and restart.
+ * This is safe, because using the override bp is just an
+ * optimization; and it's rare, so the cost doesn't matter.
+ */
+ if (zio->io_bp_override) {
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, &czp, NULL,
+ zio_ddt_ditto_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ if (cio)
+ zio_nowait(cio);
+ if (dio)
+ zio_nowait(dio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
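
Once the DDT entry is looked up under ddt_enter(), zio_ddt_write() resolves into a small number of outcomes: a verify collision falls back to an ordinary write (after upgrading to a stronger checksum when possible); a block already on disk or already being written by a lead zio only takes a reference (waiting on that lead as a child in the latter case); an override bp from dmu_sync() just fills the entry and takes a reference; otherwise this zio becomes the lead and issues the child write. A compact sketch of the non-collision decision, with illustrative names rather than the kernel types:

enum ddt_write_action {
	DDT_WAIT_FOR_LEAD,	/* another zio is writing it: zio_add_child() on it   */
	DDT_REF_EXISTING,	/* already on disk: fill bp, ddt_phys_addref()         */
	DDT_USE_OVERRIDE,	/* dmu_sync() already wrote it: fill entry and addref  */
	DDT_LEAD_NEW_WRITE	/* become dde_lead_zio[] and issue the child write     */
};

static enum ddt_write_action
ddt_write_action(int phys_birth_nonzero, int lead_zio_present, int have_override)
{
	if (phys_birth_nonzero || lead_zio_present) {
		/* When both hold, the bp is still filled from the on-disk phys. */
		return (lead_zio_present ? DDT_WAIT_FOR_LEAD : DDT_REF_EXISTING);
	}
	if (have_override)
		return (DDT_USE_OVERRIDE);
	return (DDT_LEAD_NEW_WRITE);
}
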
+
+ddt_entry_t *freedde; /* for debugging */
+
+static int
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ ddt_exit(ddt);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa->spa_normal_class;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
@@ -1660,12 +2153,12 @@ zio_dva_allocate(zio_t *zio)
ASSERT(BP_IS_HOLE(bp));
ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
if (error) {
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -1704,36 +2197,11 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- spa_t *spa = zio->io_spa;
- boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
- if (zio->io_bp == bp && !now) {
- /*
- * This is a rewrite for sync-to-convergence.
- * We can't do a metaslab_free(NOW) because bp wasn't allocated
- * during this sync pass, which means that metaslab_sync()
- * already committed the allocation.
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
- BP_IDENTITY(&zio->io_bp_orig)));
- ASSERT(spa_sync_pass(spa) > 1);
-
- if (BP_IS_GANG(bp) && gn == NULL) {
- /*
- * This is a gang leader whose gang header(s) we
- * couldn't read now, so defer the free until later.
- * The block should still be intact because without
- * the headers, we'd never even start the rewrite.
- */
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return;
- }
- }
+ ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
- metaslab_free(spa, bp, bp->blk_birth, now);
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -1747,25 +2215,31 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t txg)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t size, boolean_t use_slog)
{
- int error;
+ int error = 1;
- error = metaslab_alloc(spa, spa->spa_log_class, size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ ASSERT(txg > spa_syncing_txg(spa));
+
+ if (use_slog)
+ error = metaslab_alloc(spa, spa_log_class(spa), size,
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error)
- error = metaslab_alloc(spa, spa->spa_normal_class, size,
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
}
@@ -1773,15 +2247,15 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
}
/*
- * Free an intent log block. We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
+ * Free an intent log block.
*/
void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
+ ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
ASSERT(!BP_IS_GANG(bp));
- metaslab_free(spa, bp, txg, B_FALSE);
+ zio_free(spa, txg, bp);
}
/*
@@ -1809,6 +2283,26 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ /*
+ * We keep track of time-sensitive I/Os so that the scan thread
+ * can quickly react to certain workloads. In particular, we care
+ * about non-scrubbing, top-level reads and writes with the following
+ * characteristics:
+ * - synchronous writes of user data to non-slog devices
+ * - any reads of user data
+ * When these conditions are met, adjust the timestamp of spa_last_io
+ * which allows the scan thread to adjust its workload accordingly.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+ vd == vd->vdev_top && !vd->vdev_islog &&
+ zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+ zio->io_txg != spa_syncing_txg(spa)) {
+ uint64_t old = spa->spa_last_io;
+ uint64_t new = ddi_get_lbolt64();
+ if (old != new)
+ (void) atomic_cas_64(&spa->spa_last_io, old, new);
+ }
+
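
The spa_last_io update above is deliberately lock-free and best-effort: a single compare-and-swap is attempted, and if another thread has advanced the timestamp in the meantime the failure is simply ignored, because any sufficiently recent value serves the scan throttle equally well. The same pattern in portable C11 form (a sketch; the kernel code uses atomic_cas_64 and ddi_get_lbolt64):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t last_io;

static void
note_io(uint64_t now)
{
	uint64_t old = atomic_load_explicit(&last_io, memory_order_relaxed);

	/*
	 * One attempt only: losing the race means someone else stored an
	 * equally recent value, which is just as good.
	 */
	if (old != now)
		(void) atomic_compare_exchange_strong(&last_io, &old, now);
}
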
align = 1ULL << vd->vdev_top->vdev_ashift;
if (P2PHASE(zio->io_size, align) != 0) {
@@ -1824,7 +2318,7 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -1910,6 +2404,32 @@ zio_vdev_io_done(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+ const void *good_buf)
+{
+ /* no processing needed */
+ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+ void *buf = zio_buf_alloc(zio->io_size);
+
+ bcopy(zio->io_data, buf, zio->io_size);
+
+ zcr->zcr_cbinfo = zio->io_size;
+ zcr->zcr_cbdata = buf;
+ zcr->zcr_finish = zio_vsd_default_cksum_finish;
+ zcr->zcr_free = zio_buf_free;
+}
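
zio_vsd_default_cksum_report() is one half of the deferred checksum-ereport machinery this patch introduces; the other half is the zcr_finish loop added to zio_done() further below. The intended lifecycle, as far as it can be pieced together from this diff alone, is sketched in the comment; the exact call sites outside this file are an assumption.

/*
 * Approximate lifecycle of a zio_cksum_report_t in this patch:
 *
 *   1. zio_checksum_verify() fails and calls zfs_ereport_start_checksum(),
 *      queueing a report on zio->io_cksum_report.
 *   2. A vdev-specific vsd op -- zio_vsd_default_cksum_report() being the
 *      non-raidz default -- copies the bad bytes aside into zcr_cbdata,
 *      with zcr_free set so the copy is released along with the report.
 *   3. zio_done() walks io_cksum_report: if the overall I/O succeeded, each
 *      report is finished with the good (padded) buffer, otherwise with
 *      NULL, and zfs_ereport_free_checksum() then releases it.
 */
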
+
static int
zio_vdev_io_assess(zio_t *zio)
{
@@ -1922,7 +2442,7 @@ zio_vdev_io_assess(zio_t *zio)
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
if (zio->io_vsd != NULL) {
- zio->io_vsd_free(zio);
+ zio->io_vsd_ops->vsd_free(zio);
zio->io_vsd = NULL;
}
@@ -1931,6 +2451,9 @@ zio_vdev_io_assess(zio_t *zio)
/*
* If the I/O failed, determine whether we should attempt to retry it.
+ *
+ * On retry, we cut in line in the issue queue, since we don't want
+ * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
*/
if (zio->io_error && vd == NULL &&
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
@@ -1939,8 +2462,9 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_error = 0;
zio->io_flags |= ZIO_FLAG_IO_RETRY |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
- zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+ zio_requeue_io_start_cut_in_line);
return (ZIO_PIPELINE_STOP);
}
@@ -1972,7 +2496,7 @@ zio_vdev_io_reissue(zio_t *zio)
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
ASSERT(zio->io_error == 0);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1980,7 +2504,7 @@ zio_vdev_io_redone(zio_t *zio)
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1990,7 +2514,7 @@ zio_vdev_io_bypass(zio_t *zio)
ASSERT(zio->io_error == 0);
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
/*
@@ -2032,9 +2556,12 @@ zio_checksum_generate(zio_t *zio)
static int
zio_checksum_verify(zio_t *zio)
{
+ zio_bad_cksum_t info;
blkptr_t *bp = zio->io_bp;
int error;
+ ASSERT(zio->io_vd != NULL);
+
if (bp == NULL) {
/*
* This is zio_read_phys().
@@ -2046,11 +2573,12 @@ zio_checksum_verify(zio_t *zio)
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
}
- if ((error = zio_checksum_error(zio)) != 0) {
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
zio->io_error = error;
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, zio->io_vd, zio, 0, 0);
+ zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, zio, zio->io_offset,
+ zio->io_size, NULL, &info);
}
}
@@ -2063,7 +2591,7 @@ zio_checksum_verify(zio_t *zio)
void
zio_checksum_verified(zio_t *zio)
{
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/*
@@ -2103,7 +2631,8 @@ zio_ready(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
return (ZIO_PIPELINE_STOP);
if (zio->io_ready) {
@@ -2137,6 +2666,19 @@ zio_ready(zio_t *zio)
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
+ if (zio_injection_enabled &&
+ zio->io_spa->spa_syncing_txg == zio->io_txg)
+ zio_handle_ignored_writes(zio);
+
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2156,6 +2698,7 @@ zio_done(zio_t *zio)
*/
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
@@ -2166,23 +2709,51 @@ zio_done(zio_t *zio)
if (bp != NULL) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bp->blk_pad[2] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
(bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ zio->io_bp_override == NULL &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
}
/*
- * If there were child vdev or gang errors, they apply to us now.
+ * If there were child vdev/gang/ddt errors, they apply to us now.
*/
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ char *abuf = zio->io_data;
+
+ if (asize != psize) {
+ abuf = zio_buf_alloc(asize);
+ bcopy(zio->io_data, abuf, psize);
+ bzero(abuf + psize, asize - psize);
+ }
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, abuf);
+ zfs_ereport_free_checksum(zcr);
+
+ if (asize != psize)
+ zio_buf_free(abuf, asize);
+ }
+ }
zio_pop_transforms(zio); /* note: may set zio->io_error */
@@ -2198,8 +2769,9 @@ zio_done(zio_t *zio)
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
- if ((zio->io_error == EIO ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == lio) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
@@ -2216,22 +2788,34 @@ zio_done(zio_t *zio)
* propagate all the way to the root via zio_notify_parent().
*/
ASSERT(vd == NULL && bp != NULL);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- if (IO_IS_ALLOCATING(zio))
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
zio->io_error == ENXIO &&
- spa->spa_load_state == SPA_LOAD_NONE &&
+ spa_load_state(spa) == SPA_LOAD_NONE &&
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ /*
+ * Here is a possibly good place to attempt to do
+ * either combinatorial reconstruction or error correction
+ * based on checksums. It also might be a good place
+ * to send out preliminary ereports before we suspend
+ * processing.
+ */
}
/*
@@ -2242,11 +2826,10 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
- zio->io_child_type == ZIO_CHILD_LOGICAL) {
- ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
- }
zio_gang_tree_free(&zio->io_gang_tree);
@@ -2320,22 +2903,33 @@ zio_done(zio_t *zio)
#ifdef _KERNEL
(void) taskq_dispatch_safe(
spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio,
- &zio->io_task_issue);
+ (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
+ &zio->io_task_issue);
#else
(void) taskq_dispatch(
spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+ (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
#endif
}
return (ZIO_PIPELINE_STOP);
}
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
+ * Report any checksum errors, since the I/O is complete.
+ */
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
+ }
+
+ /*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
@@ -2371,12 +2965,17 @@ zio_done(zio_t *zio)
* I/O pipeline definition
* ==========================================================================
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
+static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
- zio_issue_async,
zio_read_bp_init,
+ zio_free_bp_init,
+ zio_issue_async,
zio_write_bp_init,
zio_checksum_generate,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_allocate,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
index bf7fe733fe0c..c8fe20f2eb4e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -19,14 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <zfs_fletcher.h>
/*
* Checksum vectors.
@@ -49,13 +50,13 @@
* we want the ability to take advantage of that hardware.
*
* Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
*
* When writing a block, we always checksum it with the latest-and-greatest
* checksum function of the appropriate strength. When reading a block,
* we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
*/
/*ARGSUSED*/
@@ -66,19 +67,20 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+ {{NULL, NULL}, 0, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};
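
Each table row gains a numeric column. Judging from how the fields are consulted elsewhere in this patch (ci_eck in zio_checksum_compute() and zio_checksum_error(), ci_dedup in zio_ddt_write() and zio_checksum_dedup_select()), the per-row layout is presumably the following; this struct is reconstructed from usage as an aid to reading the table, not quoted from zio_checksum.h:

typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2];	/* native and byteswapped implementations  */
	int		ci_correctable;	/* correctable-error strength              */
	int		ci_eck;		/* stores an embedded checksum (zio_eck_t) */
	int		ci_dedup;	/* strong enough to dedup on by default    */
	char		*ci_name;	/* property / display name                 */
} zio_checksum_info_t;
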
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
{
ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
@@ -93,6 +95,29 @@ zio_checksum_select(uint8_t child, uint8_t parent)
return (child);
}
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
+
/*
* Set the external verifier for a gang block based on <vdev, offset, txg>,
* a tuple which is guaranteed to be unique for the life of the pool.
@@ -101,7 +126,7 @@ static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{
dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = bp->blk_birth;
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
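
The "external verifier" idea above can be pictured with a small standalone sketch: the 256-bit checksum slot is filled with a tuple that is unique for the life of the pool, so a stale or misplaced gang header cannot masquerade as a valid one. The struct and field layout here are assumptions for the illustration, not the actual ZIO_SET_CHECKSUM macro.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t zc_word[4]; } cksum_t;

static void
set_gang_verifier(cksum_t *zcp, uint64_t vdev, uint64_t offset, uint64_t txg)
{
	zcp->zc_word[0] = vdev;		/* top-level vdev id */
	zcp->zc_word[1] = offset;	/* allocation offset on that vdev */
	zcp->zc_word[2] = txg;		/* physical birth txg */
	zcp->zc_word[3] = 0;
}

int
main(void)
{
	cksum_t v;

	set_gang_verifier(&v, 2, 0x40000, 9001);
	printf("%llu/%llu/%llu\n", (unsigned long long)v.zc_word[0],
	    (unsigned long long)v.zc_word[1], (unsigned long long)v.zc_word[2]);
	return (0);
}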
@@ -128,47 +153,79 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t zbt_cksum;
+ zio_cksum_t cksum;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+
+ size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = &zilc->zc_eck;
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_checksum_gang_verifier(&zbt->zbt_cksum, bp);
+ zio_checksum_gang_verifier(&eck->zec_cksum, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&zbt->zbt_cksum, offset);
+ zio_checksum_label_verifier(&eck->zec_cksum, offset);
else
- bp->blk_cksum = zbt->zbt_cksum;
- zbt->zbt_magic = ZBT_MAGIC;
- ci->ci_func[0](data, size, &zbt_cksum);
- zbt->zbt_cksum = zbt_cksum;
+ bp->blk_cksum = eck->zec_cksum;
+ eck->zec_magic = ZEC_MAGIC;
+ ci->ci_func[0](data, size, &cksum);
+ eck->zec_cksum = cksum;
} else {
ci->ci_func[0](data, size, &bp->blk_cksum);
}
}
int
-zio_checksum_error(zio_t *zio)
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int byteswap;
- void *data = zio->io_data;
+ int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (EINVAL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+ uint64_t nused;
+
+ eck = &zilc->zc_eck;
+ if (eck->zec_magic == ZEC_MAGIC)
+ nused = zilc->zc_nused;
+ else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+ nused = BSWAP_64(zilc->zc_nused);
+ else
+ return (ECKSUM);
+
+ if (nused > size)
+ return (ECKSUM);
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
+
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
zio_checksum_gang_verifier(&verifier, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
@@ -176,15 +233,15 @@ zio_checksum_error(zio_t *zio)
else
verifier = bp->blk_cksum;
- byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
+ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = verifier;
+ expected_cksum = eck->zec_cksum;
+ eck->zec_cksum = verifier;
ci->ci_func[byteswap](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
+ eck->zec_cksum = expected_cksum;
if (byteswap)
byteswap_uint64_array(&expected_cksum,
@@ -196,11 +253,22 @@ zio_checksum_error(zio_t *zio)
ci->ci_func[byteswap](data, size, &actual_cksum);
}
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (ECKSUM);
- if (zio_injection_enabled && !zio->io_error)
- return (zio_handle_fault_injection(zio, ECKSUM));
+ if (zio_injection_enabled && !zio->io_error &&
+ (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
+
+ info->zbc_injected = 1;
+ return (error);
+ }
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
index c563be4eb955..f148977c4468 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/compress.h>
#include <sys/spa.h>
@@ -51,10 +49,11 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{gzip_compress, gzip_decompress, 7, "gzip-7"},
{gzip_compress, gzip_decompress, 8, "gzip-8"},
{gzip_compress, gzip_decompress, 9, "gzip-9"},
+ {zle_compress, zle_decompress, 64, "zle"},
};
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
+enum zio_compress
+zio_compress_select(enum zio_compress child, enum zio_compress parent)
{
ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
@@ -69,80 +68,65 @@ zio_compress_select(uint8_t child, uint8_t parent)
return (child);
}
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
- uint64_t *destsizep, uint64_t *destbufsizep)
+size_t
+zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- uint64_t ciosize, gapsize, destbufsize;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- char *dest;
- uint_t allzero;
+ size_t c_len, d_len, r_len;
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
/*
* If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by setting *destsizep = 0.
+ * a block for it. We indicate this by returning zero size.
*/
- allzero = 1;
- word = src;
- word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
- while (word < word_end) {
- if (*word++ != 0) {
- allzero = 0;
+ word_end = (uint64_t *)((char *)src + s_len);
+ for (word = src; word < word_end; word++)
+ if (*word != 0)
break;
- }
- }
- if (allzero) {
- *destp = NULL;
- *destsizep = 0;
- *destbufsizep = 0;
- return (1);
- }
- if (cpfunc == ZIO_COMPRESS_EMPTY)
+ if (word == word_end)
return (0);
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
/* Compress at least 12.5% */
- destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
- if (destbufsize == 0)
- return (0);
- dest = zio_buf_alloc(destbufsize);
+ d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
+ if (d_len == 0)
+ return (s_len);
- ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
- (size_t)destbufsize, ci->ci_level);
- if (ciosize > destbufsize) {
- zio_buf_free(dest, destbufsize);
- return (0);
- }
+ c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
- /* Cool. We compressed at least as much as we were hoping to. */
+ if (c_len > d_len)
+ return (s_len);
- /* For security, make sure we don't write random heap crap to disk */
- gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
- if (gapsize != 0) {
- bzero(dest + ciosize, gapsize);
- ciosize += gapsize;
+ /*
+ * Cool. We compressed at least as much as we were hoping to.
+ * For both security and repeatability, pad out the last sector.
+ */
+ r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
+ if (r_len > c_len) {
+ bzero((char *)dst + c_len, r_len - c_len);
+ c_len = r_len;
}
- ASSERT3U(ciosize, <=, destbufsize);
- ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
- *destp = dest;
- *destsizep = ciosize;
- *destbufsizep = destbufsize;
+ ASSERT3U(c_len, <=, d_len);
+ ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
- return (1);
+ return (c_len);
}
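
The new return convention of zio_compress_data() above folds three outcomes into one size_t: 0 means "all zeroes, allocate nothing", s_len means "store uncompressed", and anything else is the sector-padded compressed length. The following userland sketch reproduces that convention with an identity stand-in for the compressor; the names and the 512-byte sector constant are local to the sketch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	MINBLOCK	512

static size_t
sector_roundup(size_t x)
{
	return ((x + MINBLOCK - 1) / MINBLOCK * MINBLOCK);
}

static size_t
compress_block(const void *src, void *dst, size_t s_len)
{
	const uint64_t *w = src;
	size_t i, c_len, d_len;

	/* all-zero block: the caller records a hole instead of allocating */
	for (i = 0; i < s_len / sizeof (uint64_t); i++)
		if (w[i] != 0)
			break;
	if (i == s_len / sizeof (uint64_t))
		return (0);

	/* must save at least 12.5%, rounded down to a sector boundary */
	d_len = (s_len - (s_len >> 3)) / MINBLOCK * MINBLOCK;
	if (d_len == 0)
		return (s_len);

	/* stand-in "compressor": pretend the output is as large as the input */
	c_len = s_len;
	if (c_len > d_len)
		return (s_len);		/* didn't compress enough */

	memcpy(dst, src, c_len);
	/* pad out the last sector for security and repeatability */
	memset((char *)dst + c_len, 0, sector_roundup(c_len) - c_len);
	return (sector_roundup(c_len));
}

int
main(void)
{
	static char src[4096], dst[4096];

	printf("all zeroes -> %zu\n", compress_block(src, dst, sizeof (src)));
	memset(src, 'x', sizeof (src));
	printf("incompressible -> %zu\n", compress_block(src, dst, sizeof (src)));
	return (0);
}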
int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize)
+zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len)
{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (EINVAL);
- return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index f8e6880c90f7..9ae7d1f697fd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -43,8 +42,8 @@
#include <sys/arc.h>
#include <sys/zio_impl.h>
#include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
#include <sys/fs/zfs.h>
uint32_t zio_injection_enabled;
@@ -70,8 +69,9 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
/*
* Check for a match against the MOS, which is based on type
*/
- if (zb->zb_objset == 0 && record->zi_objset == 0 &&
- record->zi_object == 0) {
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
if (record->zi_type == DMU_OT_NONE ||
type == record->zi_type)
return (record->zi_freq == 0 ||
@@ -96,6 +96,31 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
}
/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_type == type &&
+ strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
*/
@@ -126,8 +151,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_spa != handler->zi_spa)
continue;
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
+ /* Ignore device errors and panic injection */
+ if (handler->zi_record.zi_guid != 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
/* If this handler matches, return EIO */
@@ -159,7 +186,7 @@ zio_handle_label_injection(zio_t *zio, int error)
int label;
int ret = 0;
- if (offset + zio->io_size > VDEV_LABEL_START_SIZE &&
+ if (offset >= VDEV_LABEL_START_SIZE &&
offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
return (0);
@@ -170,8 +197,10 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults */
- if (handler->zi_record.zi_start == 0)
+ /* Ignore device only faults or panic injection */
+ if (handler->zi_record.zi_start == 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
/*
@@ -200,13 +229,30 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
inject_handler_t *handler;
int ret = 0;
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore label specific faults */
- if (handler->zi_record.zi_start != 0)
+ /*
+ * Ignore label specific faults, panic injection
+ * or fake writes
+ */
+ if (handler->zi_record.zi_start != 0 ||
+ handler->zi_record.zi_func[0] != '\0' ||
+ handler->zi_record.zi_duration != 0)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -216,6 +262,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
continue;
}
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
if (handler->zi_record.zi_error == error) {
/*
* For a failed open, pretend like the device
@@ -224,6 +276,16 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
if (error == ENXIO)
vd->vdev_stat.vs_aux =
VDEV_AUX_OPEN_FAILED;
+
+ /*
+ * Treat these errors as if they had been
+ * retried so that all the appropriate stats
+ * and FMA events are generated.
+ */
+ if (!handler->zi_record.zi_failfast &&
+ zio != NULL)
+ zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
ret = error;
break;
}
@@ -240,6 +302,84 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
}
/*
+ * Simulate hardware that ignores cache flushes. For the requested number
+ * of seconds, nix the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_duration == 0)
+ continue;
+
+ /*
+ * Positive duration implies # of seconds, negative
+ * a number of txgs
+ */
+ if (handler->zi_record.zi_timer == 0) {
+ if (handler->zi_record.zi_duration > 0)
+ handler->zi_record.zi_timer = ddi_get_lbolt64();
+ else
+ handler->zi_record.zi_timer = zio->io_txg;
+ }
+
+ /* Have a "problem" writing 60% of the time */
+ if (spa_get_random(100) < 60)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ break;
+ }
+
+ rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+ inject_handler_t *handler;
+
+ if (zio_injection_enabled == 0)
+ return;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_duration == 0)
+ continue;
+
+ if (handler->zi_record.zi_duration > 0) {
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer +
+ handler->zi_record.zi_duration * hz >
+ ddi_get_lbolt64());
+ } else {
+ /* duration is negative so the subtraction here adds */
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer -
+ handler->zi_record.zi_duration >=
+ spa_syncing_txg(spa));
+ }
+ }
+
+ rw_exit(&inject_lock);
+}
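
The two routines above share one convention worth spelling out: a positive zi_duration is a number of seconds of lbolt time, a negative one is a number of transaction groups, and the subtraction of a negative duration therefore extends the window. The sketch below models just that check in userland; the HZ value and all names are assumptions for the illustration.

#include <stdint.h>
#include <stdio.h>

#define	HZ	100	/* stand-in for the kernel tick rate */

static int
ignored_write_window_expired(int64_t duration, int64_t timer,
    int64_t now_ticks, uint64_t syncing_txg)
{
	if (duration > 0)				/* seconds of lbolt time */
		return (now_ticks >= timer + duration * HZ);
	/* negative duration: subtracting it adds the txg count */
	return (syncing_txg >= (uint64_t)(timer - duration));
}

int
main(void)
{
	/* 10-second window armed at tick 5000, checked at tick 6200 */
	printf("time-based expired: %d\n",
	    ignored_write_window_expired(10, 5000, 6200, 0));
	/* 3-txg window armed at txg 100, checked while syncing txg 102 */
	printf("txg-based expired:  %d\n",
	    ignored_write_window_expired(-3, 100, 0, 102));
	return (0);
}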
+
+/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,
* which is the switch to trigger all fault injection.
@@ -336,7 +476,6 @@ int
zio_clear_fault(int id)
{
inject_handler_t *handler;
- int ret;
rw_enter(&inject_lock, RW_WRITER);
@@ -346,18 +485,18 @@ zio_clear_fault(int id)
break;
if (handler == NULL) {
- ret = ENOENT;
- } else {
- list_remove(&inject_handlers, handler);
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
- ret = 0;
+ rw_exit(&inject_lock);
+ return (ENOENT);
}
+ list_remove(&inject_handlers, handler);
rw_exit(&inject_lock);
- return (ret);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+
+ return (0);
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
new file mode 100644
index 000000000000..13c5673fbe26
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next b - n + 1 bytes are zero.

+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
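
A quick way to see the encoding in action is a userland round-trip harness for the two routines above. This assumes zle_compress() and zle_decompress() are compiled into the same program; outside the kernel, uchar_t and MIN may need the local definitions shown here (the harness itself is not part of the patch).

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef unsigned char uchar_t;
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

size_t zle_compress(void *, void *, size_t, size_t, int);
int zle_decompress(void *, void *, size_t, size_t, int);

int
main(void)
{
	static uchar_t src[4096], comp[4096], out[4096];
	size_t c_len;

	/* sparse test pattern: mostly zeroes with two short literal runs */
	memset(src, 0, sizeof (src));
	memcpy(src + 100, "hello", 5);
	memcpy(src + 3000, "world", 5);

	c_len = zle_compress(src, comp, sizeof (src), sizeof (comp), 64);
	printf("compressed %zu -> %zu bytes\n", sizeof (src), c_len);

	if (c_len < sizeof (src)) {
		(void) zle_decompress(comp, out, c_len, sizeof (out), 64);
		printf("round trip %s\n",
		    memcmp(src, out, sizeof (src)) == 0 ? "ok" : "FAILED");
	}
	return (0);
}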
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
new file mode 100644
index 000000000000..ec94b08555be
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED ((uint32_t)-1)
+#define ZRL_DESTROYED -2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == 0);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+#ifdef ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ ASSERT(zrl->zr_refcount >= 0);
+ zrl->zr_refcount++;
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ mutex_exit(&zrl->zr_mtx);
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT((int32_t)n > ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ int n = (int)zrl->zr_refcount;
+ return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
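
The ZRL semantics described at the top of this file (readers add and remove a count; a "writer" can lock only when the count is zero, and only without waiting) can be modelled compactly in userland with C11 atomics. The sketch below is an illustration of those semantics only, using a spin instead of the kernel mutex/condvar; it is not the implementation above.

#include <stdatomic.h>
#include <stdio.h>

#define	ZRL_LOCKED	(-1)

typedef struct {
	atomic_int	zr_refcount;
} zrl_model_t;

static void
model_add(zrl_model_t *z)
{
	int n = atomic_load(&z->zr_refcount);

	for (;;) {
		while (n == ZRL_LOCKED)		/* wait for the lock holder */
			n = atomic_load(&z->zr_refcount);
		if (atomic_compare_exchange_weak(&z->zr_refcount, &n, n + 1))
			return;			/* reader reference taken */
	}
}

static void
model_remove(zrl_model_t *z)
{
	atomic_fetch_sub(&z->zr_refcount, 1);
}

static int
model_tryenter(zrl_model_t *z)
{
	int zero = 0;

	/* succeeds only if there are currently zero references */
	return (atomic_compare_exchange_strong(&z->zr_refcount, &zero,
	    ZRL_LOCKED));
}

static void
model_exit(zrl_model_t *z)
{
	atomic_store(&z->zr_refcount, 0);
}

int
main(void)
{
	zrl_model_t z;

	atomic_init(&z.zr_refcount, 0);
	model_add(&z);				/* reader holds a reference */
	printf("tryenter with a reader: %d\n", model_tryenter(&z));	/* 0 */
	model_remove(&z);
	printf("tryenter at zero refs:  %d\n", model_tryenter(&z));	/* 1 */
	model_exit(&z);
	return (0);
}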
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 8f769e6f9a81..507f297674e5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -19,13 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*/
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
+
+/* Portions Copyright 2010 Robert Milkowski */
/*
* ZFS volume emulation driver.
@@ -36,9 +36,13 @@
* /dev/zvol/dsk/<pool_name>/<dataset_name>
* /dev/zvol/rdsk/<pool_name>/<dataset_name>
*
- * These links are created by the ZFS-specific devfsadm link generator.
+ * These links are created by the /dev filesystem (sdev_zvolops.c).
* Volumes are persistent through reboot. No user command needs to be
* run before opening and using a device.
+ *
+ * FreeBSD notes.
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system.
*/
#include <sys/types.h>
@@ -77,8 +81,6 @@
#include "zfs_namecheck.h"
-#define ZVOL_DUMPSIZE "dumpsize"
-
struct g_class zfs_zvol_class = {
.name = "ZFS::ZVOL",
.version = G_VERSION,
@@ -86,13 +88,18 @@ struct g_class zfs_zvol_class = {
DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+void *zfsdev_state;
+static char *zvol_tag = "zvol_tag";
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
/*
- * This lock protects the zvol_state structure from being modified
+ * This lock protects the zfsdev_state structure from being modified
* while it's being used, e.g. an open that comes in before a create
* finishes. It also protects temporary opens of the dataset so that,
* e.g., an open doesn't get a spurious EBUSY.
*/
-static kmutex_t zvol_state_lock;
+kmutex_t zfsdev_state_lock;
static uint32_t zvol_minors;
typedef struct zvol_extent {
@@ -110,13 +117,13 @@ typedef struct zvol_state {
uint64_t zv_volblocksize; /* volume block size */
struct g_provider *zv_provider; /* GEOM provider */
uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_flags; /* readonly; dumpified */
+ uint8_t zv_flags; /* readonly, dumpified, etc. */
objset_t *zv_objset; /* objset handle */
- uint32_t zv_mode; /* DS_MODE_* flags at open time */
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
znode_t zv_znode; /* for range locking */
+ dmu_buf_t *zv_dbuf; /* bonus handle */
int zv_state;
struct bio_queue_head zv_queue;
struct mtx zv_queue_mtx; /* zv_queue mutex */
@@ -128,25 +135,45 @@ typedef struct zvol_state {
#define ZVOL_RDONLY 0x1
#define ZVOL_DUMPIFIED 0x2
#define ZVOL_EXCL 0x4
+#define ZVOL_WCE 0x8
/*
* zvol maximum transfer in one DMU tx.
*/
int zvol_maxphys = DMU_MAX_ACCESS/2;
-extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
+extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
+ nvlist_t *, nvlist_t **);
+static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
+static zvol_state_t *zvol_geom_create(const char *name);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+
static void
-zvol_size_changed(zvol_state_t *zv, major_t maj)
+zvol_size_changed(zvol_state_t *zv)
{
+#ifdef sun
+ dev_t dev = makedevice(maj, min);
+
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Size", volsize) == DDI_SUCCESS);
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
+
+ /* Notify specfs to invalidate the cached size */
+ spec_size_invalidate(dev, VBLK);
+ spec_size_invalidate(dev, VCHR);
+#else /* !sun */
struct g_provider *pp;
- g_topology_assert();
-
pp = zv->zv_provider;
if (pp == NULL)
return;
@@ -159,6 +186,7 @@ zvol_size_changed(zvol_state_t *zv, major_t maj)
if (zv->zv_total_opens > 0)
return;
pp->mediasize = zv->zv_volsize;
+#endif /* !sun */
}
int
@@ -188,17 +216,6 @@ zvol_check_volblocksize(uint64_t volblocksize)
return (0);
}
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
- zvol_state_t *zv = arg;
-
- if (newval)
- zv->zv_flags |= ZVOL_RDONLY;
- else
- zv->zv_flags &= ~ZVOL_RDONLY;
-}
-
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
@@ -206,7 +223,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
dmu_object_info_t doi;
uint64_t val;
-
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
if (error)
return (error);
@@ -228,230 +244,24 @@ zvol_minor_lookup(const char *name)
{
struct g_provider *pp;
struct g_geom *gp;
+ zvol_state_t *zv = NULL;
- g_topology_assert();
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ g_topology_lock();
LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
- return (pp->private);
- }
- }
-
- return (NULL);
-}
-
-static int
-zvol_access(struct g_provider *pp, int acr, int acw, int ace)
-{
- zvol_state_t *zv;
-
- g_topology_assert();
- mutex_enter(&zvol_state_lock);
-
- zv = pp->private;
- if (zv == NULL) {
- if (acr <= 0 && acw <= 0 && ace <= 0)
- return (0);
- mutex_exit(&zvol_state_lock);
- return (pp->error);
- }
-
- ASSERT(zv->zv_objset != NULL);
-
- if (acw > 0 &&
- ((zv->zv_flags & ZVOL_RDONLY) ||
- (zv->zv_mode & DS_MODE_READONLY))) {
- mutex_exit(&zvol_state_lock);
- return (EROFS);
- }
-
- zv->zv_total_opens += acr + acw + ace;
- zvol_size_changed(zv, 0);
-
- mutex_exit(&zvol_state_lock);
-
- return (0);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- */
-ssize_t zvol_immediate_write_sz = 32768;
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- zilog_t *zilog = zv->zv_zilog;
- lr_write_t *lr;
-
- if (zilog->zl_replay) {
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
- zilog->zl_replaying_seq;
- return;
- }
-
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
- lr = (lr_write_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = nbytes;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
- BP_ZERO(&lr->lr_blkptr);
-
- (void) zil_itx_assign(zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
- }
-}
-
-static void
-zvol_start(struct bio *bp)
-{
- zvol_state_t *zv;
-
- switch (bp->bio_cmd) {
- case BIO_READ:
- case BIO_WRITE:
- case BIO_FLUSH:
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- mtx_lock(&zv->zv_queue_mtx);
- bioq_insert_tail(&zv->zv_queue, bp);
- wakeup_one(&zv->zv_queue);
- mtx_unlock(&zv->zv_queue_mtx);
- break;
- case BIO_GETATTR:
- if (g_handleattr_int(bp, "ZFS::iszvol", 1))
- break;
- /* FALLTHROUGH */
- case BIO_DELETE:
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
-}
-
-static void
-zvol_serve_one(zvol_state_t *zv, struct bio *bp)
-{
- uint64_t off, volsize;
- size_t resid;
- char *addr;
- objset_t *os;
- rl_t *rl;
- int error = 0;
- boolean_t doread = (bp->bio_cmd == BIO_READ);
-
- off = bp->bio_offset;
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
- addr = bp->bio_data;
- resid = bp->bio_length;
-
- error = 0;
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- * A better approach than a per zvol rwlock would be to lock ranges.
- */
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
- doread ? RL_READER : RL_WRITER);
-
- while (resid != 0 && off < volsize) {
- size_t size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
-
- if (size > volsize - off) /* don't write past the end */
- size = volsize - off;
-
- if (doread) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr,
- DMU_READ_PREFETCH);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
- dmu_tx_commit(tx);
- }
- }
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = EIO;
- break;
- }
- off += size;
- addr += size;
- resid -= size;
- }
- zfs_range_unlock(rl);
-
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length)
- bp->bio_error = (off > volsize ? EINVAL : error);
-}
-
-static void
-zvol_worker(void *arg)
-{
- zvol_state_t *zv;
- struct bio *bp;
-
- thread_lock(curthread);
- sched_prio(curthread, PRIBIO);
- thread_unlock(curthread);
-
- zv = arg;
- for (;;) {
- mtx_lock(&zv->zv_queue_mtx);
- bp = bioq_takefirst(&zv->zv_queue);
- if (bp == NULL) {
- if (zv->zv_state == 1) {
- zv->zv_state = 2;
- wakeup(&zv->zv_state);
- mtx_unlock(&zv->zv_queue_mtx);
- kthread_exit();
- }
- msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
- "zvol:io", 0);
+ pp = LIST_FIRST(&gp->provider);
+ if (pp == NULL)
continue;
- }
- mtx_unlock(&zv->zv_queue_mtx);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- break;
- case BIO_READ:
- case BIO_WRITE:
- zvol_serve_one(zv, bp);
+ zv = pp->private;
+ if (zv == NULL)
+ continue;
+ if (strcmp(zv->zv_name, name) == 0)
break;
- }
-
- if (bp->bio_cmd == BIO_FLUSH && !zil_disable)
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-
- g_io_deliver(bp, bp->bio_error);
}
+ g_topology_unlock();
+
+ return (gp != NULL ? zv : NULL);
}
/* extent mapping arg */
@@ -462,8 +272,8 @@ struct maparg {
/*ARGSUSED*/
static int
-zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct maparg *ma = arg;
zvol_extent_t *ze;
@@ -515,6 +325,7 @@ zvol_free_extents(zvol_state_t *zv)
static int
zvol_get_lbas(zvol_state_t *zv)
{
+ objset_t *os = zv->zv_objset;
struct maparg ma;
int err;
@@ -522,7 +333,9 @@ zvol_get_lbas(zvol_state_t *zv)
ma.ma_blks = 0;
zvol_free_extents(zv);
- err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
+ /* commit any in-flight changes before traversing the dataset */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = traverse_dataset(dmu_objset_ds(os), 0,
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
zvol_free_extents(zv);
@@ -577,25 +390,32 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
objset_t *os = zv->zv_objset;
char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
+ uint64_t offset, length;
dmu_tx_t *tx;
int error;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(dmu_objset_zil(os), lr);
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
dmu_tx_commit(tx);
}
@@ -636,58 +456,101 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_WRITE2 */
};
+#ifdef sun
+int
+zvol_name2minor(const char *name, minor_t *minor)
+{
+ zvol_state_t *zv;
+
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+ if (minor && zv)
+ *minor = zv->zv_minor;
+ mutex_exit(&zfsdev_state_lock);
+ return (zv ? 0 : -1);
+}
+#endif /* sun */
+
/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
int
-zvol_create_minor(const char *name, major_t maj)
+zvol_create_minor(const char *name)
{
- struct g_provider *pp;
- struct g_geom *gp;
+ zfs_soft_state_t *zs;
zvol_state_t *zv;
objset_t *os;
dmu_object_info_t doi;
- uint64_t volsize;
- int ds_mode = DS_MODE_OWNER;
int error;
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
- if ((zv = zvol_minor_lookup(name)) != NULL) {
- error = EEXIST;
- goto end;
+ mutex_enter(&zfsdev_state_lock);
+
+ if (zvol_minor_lookup(name) != NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (EEXIST);
}
- if (strchr(name, '@') != 0)
- ds_mode |= DS_MODE_READONLY;
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
- error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
- if (error)
- goto end;
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
- dmu_objset_close(os);
- goto end;
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
}
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_start;
- gp->access = zvol_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
- pp->mediasize = volsize;
- pp->sectorsize = DEV_BSIZE;
+#ifdef sun
+ if ((minor = zfsdev_minor_alloc()) == 0) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
+ }
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- (void) strcpy(zv->zv_name, name);
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+ (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
+ (char *)name);
+
+ (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
+
+ if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+
+ (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
+
+ if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(zfs_dip, chrbuf);
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (EAGAIN);
+ }
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_ZVOL;
+ zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+#else /* !sun */
+
+ DROP_GIANT();
+ g_topology_lock();
+ zv = zvol_geom_create(name);
+#endif /* !sun */
+
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
zv->zv_min_bs = DEV_BSHIFT;
- zv->zv_provider = pp;
- zv->zv_volsize = pp->mediasize;
zv->zv_objset = os;
- zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, zvol_get_data);
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
@@ -698,93 +561,134 @@ zvol_create_minor(const char *name, major_t maj)
ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size;
- zil_replay(os, zv, zvol_replay_vector);
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ dmu_objset_disown(os, FTAG);
+ zv->zv_objset = NULL;
- /* XXX this should handle the possible i/o error */
- VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
+ zvol_minors++;
- pp->private = zv;
- g_error_provider(pp, 0);
+ mutex_exit(&zfsdev_state_lock);
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
- zv->zv_state = 0;
- kproc_kthread_add(zvol_worker, zv, &zfsproc, NULL, 0, 0, "zfskern",
- "zvol %s", pp->name + strlen(ZVOL_DEV_DIR) + 1);
+ zvol_geom_run(zv);
- zvol_minors++;
-end:
- mutex_exit(&zvol_state_lock);
g_topology_unlock();
PICKUP_GIANT();
- return (error);
+ ZFS_LOG(1, "ZVOL %s created.", name);
+
+ return (0);
}
/*
* Remove minor node for the specified volume.
*/
+static int
+zvol_remove_zv(zvol_state_t *zv)
+{
+#ifdef sun
+ minor_t minor = zv->zv_minor;
+#endif
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ if (zv->zv_total_opens != 0)
+ return (EBUSY);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+#ifdef sun
+ (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
+ ddi_remove_minor_node(zfs_dip, nmbuf);
+#endif /* sun */
+
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
+
+ zvol_geom_destroy(zv);
+
+ zvol_minors--;
+ return (0);
+}
+
int
zvol_remove_minor(const char *name)
{
- struct g_provider *pp;
zvol_state_t *zv;
- int error = 0;
-
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ int rc;
+ mutex_enter(&zfsdev_state_lock);
if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
+ g_topology_lock();
+ rc = zvol_remove_zv(zv);
+ g_topology_unlock();
+ mutex_exit(&zfsdev_state_lock);
+ return (rc);
+}
- if (zv->zv_total_opens != 0) {
- error = EBUSY;
- goto end;
- }
+int
+zvol_first_open(zvol_state_t *zv)
+{
+ objset_t *os;
+ uint64_t volsize;
+ int error;
+ uint64_t readonly;
- VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
+ /* lie and say we're read-only */
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
+ zvol_tag, &os);
+ if (error)
+ return (error);
- mtx_lock(&zv->zv_queue_mtx);
- zv->zv_state = 1;
- wakeup_one(&zv->zv_queue);
- while (zv->zv_state != 2)
- msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
- mtx_unlock(&zv->zv_queue_mtx);
- mtx_destroy(&zv->zv_queue_mtx);
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error) {
+ ASSERT(error == 0);
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+ zv->zv_objset = os;
+ error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+ if (error) {
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+ zv->zv_volsize = volsize;
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+ zvol_size_changed(zv);
- pp = zv->zv_provider;
- pp->private = NULL;
- g_wither_geom(pp->geom, ENXIO);
+ VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
+ NULL) == 0);
+ if (readonly || dmu_objset_is_snapshot(os) ||
+ !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+ else
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ return (error);
+}
+void
+zvol_last_close(zvol_state_t *zv)
+{
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- dmu_objset_close(zv->zv_objset);
+ dmu_buf_rele(zv->zv_dbuf, zvol_tag);
+ zv->zv_dbuf = NULL;
+ dmu_objset_disown(zv->zv_objset, zvol_tag);
zv->zv_objset = NULL;
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
-
- kmem_free(zv, sizeof(*zv));
-
- zvol_minors--;
-end:
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
}
+#ifdef sun
int
zvol_prealloc(zvol_state_t *zv)
{
objset_t *os = zv->zv_objset;
dmu_tx_t *tx;
- void *data;
uint64_t refd, avail, usedobjs, availobjs;
uint64_t resid = zv->zv_volsize;
uint64_t off = 0;
@@ -797,9 +701,6 @@ zvol_prealloc(zvol_state_t *zv)
/* Free old extents if they exist */
zvol_free_extents(zv);
- /* allocate the blocks by writing each one */
- data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
-
while (resid != 0) {
int error;
uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
@@ -809,30 +710,29 @@ zvol_prealloc(zvol_state_t *zv)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- kmem_free(data, SPA_MAXBLOCKSIZE);
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
return (error);
}
- dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+ dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
dmu_tx_commit(tx);
off += bytes;
resid -= bytes;
}
- kmem_free(data, SPA_MAXBLOCKSIZE);
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);
}
+#endif /* sun */
int
-zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
+zvol_update_volsize(objset_t *os, uint64_t volsize)
{
dmu_tx_t *tx;
int error;
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- tx = dmu_tx_create(zv->zv_objset);
+ tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -840,140 +740,232 @@ zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
return (error);
}
- error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
&volsize, tx);
dmu_tx_commit(tx);
if (error == 0)
- error = dmu_free_long_range(zv->zv_objset,
+ error = dmu_free_long_range(os,
ZVOL_OBJ, volsize, DMU_OBJECT_END);
+ return (error);
+}
- /*
- * If we are using a faked-up state (zv_provider == NULL) then don't
- * try to update the in-core zvol state.
- */
- if (error == 0 && zv->zv_provider) {
- zv->zv_volsize = volsize;
- zvol_size_changed(zv, maj);
+void
+zvol_remove_minors(const char *name)
+{
+ struct g_provider *pp, *pptmp;
+ struct g_geom *gp, *gptmp;
+ zvol_state_t *zv;
+ char *namebuf;
+
+ namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
+ (void) strncpy(namebuf, name, strlen(name));
+ (void) strcat(namebuf, "/");
+ DROP_GIANT();
+ mutex_enter(&zfsdev_state_lock);
+ g_topology_lock();
+
+ LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
+ pp = LIST_FIRST(&gp->provider);
+ if (pp == NULL)
+ continue;
+ zv = pp->private;
+ if (zv == NULL)
+ continue;
+ if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
+ (void) zvol_remove_zv(zv);
}
- return (error);
+ kmem_free(namebuf, strlen(name) + 2);
+
+ g_topology_unlock();
+ mutex_exit(&zfsdev_state_lock);
+ PICKUP_GIANT();
}
int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
- zvol_state_t *zv;
+ zvol_state_t *zv = NULL;
+ objset_t *os;
int error;
dmu_object_info_t doi;
uint64_t old_volsize = 0ULL;
- zvol_state_t state = { 0 };
-
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ uint64_t readonly;
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- /*
- * If we are doing a "zfs clone -o volsize=", then the
- * minor node won't exist yet.
- */
- error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
- &state.zv_objset);
- if (error != 0)
- goto out;
- zv = &state;
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
}
- old_volsize = zv->zv_volsize;
- if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
+ if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
(error = zvol_check_volsize(volsize,
doi.doi_data_block_size)) != 0)
goto out;
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
+ VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
+ NULL) == 0);
+ if (readonly) {
error = EROFS;
goto out;
}
- error = zvol_update_volsize(zv, maj, volsize);
-
-#if 0
+ error = zvol_update_volsize(os, volsize);
/*
* Reinitialize the dump area to the new size. If we
- * failed to resize the dump area then restore the it back to
- * it's original size.
+ * failed to resize the dump area then restore it back to
+ * its original size.
*/
- if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
- if ((error = zvol_dumpify(zv)) != 0 ||
- (error = dumpvp_resize()) != 0) {
- (void) zvol_update_volsize(zv, maj, old_volsize);
- error = zvol_dumpify(zv);
+ if (zv && error == 0) {
+#ifdef ZVOL_DUMP
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ old_volsize = zv->zv_volsize;
+ zv->zv_volsize = volsize;
+ if ((error = zvol_dumpify(zv)) != 0 ||
+ (error = dumpvp_resize()) != 0) {
+ (void) zvol_update_volsize(os, old_volsize);
+ zv->zv_volsize = old_volsize;
+ error = zvol_dumpify(zv);
+ }
+ }
+#endif /* ZVOL_DUMP */
+ if (error == 0) {
+ zv->zv_volsize = volsize;
+ zvol_size_changed(zv);
}
}
-#endif
+
+#ifdef sun
+ /*
+ * Generate a LUN expansion event.
+ */
+ if (zv && error == 0) {
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
+ zv->zv_minor);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+#endif /* sun */
out:
- if (state.zv_objset)
- dmu_objset_close(state.zv_objset);
+ dmu_objset_rele(os, FTAG);
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+/*ARGSUSED*/
+static int
+zvol_open(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
+ int err = 0;
- DROP_GIANT();
- g_topology_lock();
- mutex_enter(&zvol_state_lock);
+ mutex_enter(&zfsdev_state_lock);
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
+ zv = pp->private;
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
+
+ if (zv->zv_total_opens == 0)
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = EROFS;
+ goto out;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = EBUSY;
+ goto out;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_total_opens != 0) {
+ err = EBUSY;
+ goto out;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
}
+#endif
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
- volblocksize, 0, tx);
- if (error == ENOTSUP)
- error = EBUSY;
- dmu_tx_commit(tx);
- if (error == 0)
- zv->zv_volblocksize = volblocksize;
+ zv->zv_total_opens += count;
+ mutex_exit(&zfsdev_state_lock);
+
+ return (err);
+out:
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+}
+
+/*ARGSUSED*/
+static int
+zvol_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int error = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = pp->private;
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
}
-end:
- mutex_exit(&zvol_state_lock);
- g_topology_unlock();
- PICKUP_GIANT();
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_total_opens == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_total_opens != 0);
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_total_opens -= count;
+
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
+static void
+zvol_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_range_unlock(zgd->zgd_rl);
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
}
@@ -985,15 +977,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
+ uint64_t object = ZVOL_OBJ;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length; /* length of user data */
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
int error;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
+
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -1002,97 +999,717 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
- DMU_READ_NO_PREFETCH));
+ if (buf != NULL) { /* immediate write */
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else {
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN(offset, size);
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
+ zvol_get_done(zgd, error);
- /*
- * Lock the range of the block to ensure that when the data is
- * written out and its checksum is being calculated that no other
- * thread can change the block.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
+ return (error);
+}
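
The immediate/indirect split described in the comment above boils down to two read plans: an immediate record copies exactly the logged bytes, while an indirect record rounds the request down to a full volume block so the block can be synced and referenced by block pointer. The standalone sketch below models only that decision; the block size and all names are invented for the illustration.

#include <stdint.h>
#include <stdio.h>

#define	VOLBLOCKSIZE	8192ULL

struct read_plan {
	uint64_t	offset;		/* where to read from */
	uint64_t	size;		/* how much to read */
	int		indirect;	/* 1: sync whole block, 0: copy bytes */
};

static struct read_plan
plan_get_data(uint64_t offset, uint64_t size, int have_buffer)
{
	struct read_plan p;

	if (have_buffer) {		/* immediate: copy exactly the data */
		p.offset = offset;
		p.size = size;
		p.indirect = 0;
	} else {			/* indirect: operate on the full block */
		p.size = VOLBLOCKSIZE;
		p.offset = offset - (offset % VOLBLOCKSIZE);
		p.indirect = 1;
	}
	return (p);
}

int
main(void)
{
	struct read_plan p = plan_get_data(12345, 512, 0);

	printf("indirect read of %llu bytes at offset %llu\n",
	    (unsigned long long)p.size, (unsigned long long)p.offset);
	return (0);
}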
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ zilog_t *zilog = zv->zv_zilog;
+ boolean_t slogging;
+ ssize_t immediate_write_sz;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ ? 0 : zvol_immediate_write_sz;
+
+ slogging = spa_has_slogs(zilog->zl_spa) &&
+ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ ssize_t len;
+ itx_wr_state_t write_state;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0) {
/*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to
- * replay.
+ * Unlike zfs_log_write() we can be called with
+		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
*/
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
+ if (blocksize > immediate_write_sz && !slogging &&
+ resid >= blocksize && off % blocksize == 0) {
+ write_state = WR_INDIRECT; /* uses dmu_sync */
+ len = blocksize;
+ } else if (sync) {
+ write_state = WR_COPIED;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ } else {
+ write_state = WR_NEED_COPY;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ }
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = write_state;
+ if (write_state == WR_NEED_COPY)
+ itx->itx_sod += len;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+#ifdef sun
+static int
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
+ boolean_t doread, boolean_t isdump)
+{
+ vdev_disk_t *dvd;
+ int c;
+ int numerrors = 0;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (numerrors < vd->vdev_children ? 0 : EIO);
+
+ if (doread && !vdev_readable(vd))
+ return (EIO);
+ else if (!doread && !vdev_writeable(vd))
+ return (EIO);
+
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ offset += VDEV_LABEL_START_SIZE;
+
+ if (ddi_in_panic() || isdump) {
+ ASSERT(!doread);
+ if (doread)
+ return (EIO);
+ return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
+ lbtodb(size)));
+ } else {
+ return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
+ doread ? B_READ : B_WRITE));
}
+}
+
+static int
+zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
+ boolean_t doread, boolean_t isdump)
+{
+ vdev_t *vd;
+ int error;
+ zvol_extent_t *ze;
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+
+	/* Must be sector aligned, and not straddle a block boundary. */
+ if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
+ P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
+ return (EINVAL);
+ }
+ ASSERT(size <= zv->zv_volblocksize);
+
+ /* Locate the extent this belongs to */
+ ze = list_head(&zv->zv_extents);
+ while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
+ offset -= ze->ze_nblks * zv->zv_volblocksize;
+ ze = list_next(&zv->zv_extents, ze);
+ }
+
+ if (!ddi_in_panic())
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
+ offset += DVA_GET_OFFSET(&ze->ze_dva);
+ error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+
+ if (!ddi_in_panic())
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ return (error);
+}
+#endif /* sun */
+
+int
+zvol_strategy(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ rl_t *rl;
+ int error = 0;
+ boolean_t doread = (bp->bio_cmd == BIO_READ);
+ boolean_t sync;
+
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return (0);
+ }
+
+ if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+ g_io_deliver(bp, EROFS);
+ return (0);
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ g_io_deliver(bp, EIO);
+ return (0);
+ }
+
+ sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
/*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
*/
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length)
+ bp->bio_error = (off > volsize ? EINVAL : error);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+
+ return (0);
+}
+
+#ifdef sun
+/*
+ * Set the buffer count to the zvol maximum transfer.
+ * Using our own routine instead of the default minphys()
+ * means that for larger writes we write bigger buffers on X86
+ * (128K instead of 56K) and flush the disk write cache less often
+ * (every zvol_maxphys - currently 1MB) instead of minphys (currently
+ * 56K on X86 and 128K on sparc).
+ */
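+/*
+ * zvol_read() and zvol_write() pass this routine to physio() when the
+ * volume has been dumpified, clamping each transfer to zvol_maxphys.
+ */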
+void
+zvol_minphys(struct buf *bp)
+{
+ if (bp->b_bcount > zvol_maxphys)
+ bp->b_bcount = zvol_maxphys;
+}
+
+int
+zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
+{
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ int error = 0;
+ uint64_t size;
+ uint64_t boff;
+ uint64_t resid;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ boff = ldbtob(blkno);
+ resid = ldbtob(nblocks);
+
+ VERIFY3U(boff + resid, <=, zv->zv_volsize);
+
+ while (resid) {
+ size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
+ error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
+ if (error)
+ break;
+ boff += size;
+ addr += size;
+ resid -= size;
+ }
+
return (error);
}
+/*ARGSUSED*/
int
-zvol_busy(void)
+zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
- return (zvol_minors != 0);
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ uint64_t volsize;
+ rl_t *rl;
+ int error = 0;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ volsize = zv->zv_volsize;
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ return (EIO);
+
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_READ,
+ zvol_minphys, uio);
+ return (error);
+ }
+
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_READER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio->uio_loffset)
+ bytes = volsize - uio->uio_loffset;
+
+ error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
+ break;
+ }
+ }
+ zfs_range_unlock(rl);
+ return (error);
}
-void
-zvol_init(void)
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
- ZFS_LOG(1, "ZVOL Initialized.");
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ uint64_t volsize;
+ rl_t *rl;
+ int error = 0;
+ boolean_t sync;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+
+ volsize = zv->zv_volsize;
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ return (EIO);
+
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_WRITE,
+ zvol_minphys, uio);
+ return (error);
+ }
+
+ sync = !(zv->zv_flags & ZVOL_WCE) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_WRITER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio->uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_range_unlock(rl);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ return (error);
}
+int
+zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
+{
+ struct uuid uuid = EFI_RESERVED;
+ efi_gpe_t gpe = { 0 };
+ uint32_t crc;
+ dk_efi_t efi;
+ int length;
+ char *ptr;
+
+ if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
+ return (EFAULT);
+ ptr = (char *)(uintptr_t)efi.dki_data_64;
+ length = efi.dki_length;
+ /*
+ * Some clients may attempt to request a PMBR for the
+ * zvol. Currently this interface will return EINVAL to
+ * such requests. These requests could be supported by
+ * adding a check for lba == 0 and consing up an appropriate
+ * PMBR.
+ */
+ if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
+ return (EINVAL);
+
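+	/*
+	 * The label synthesized below is a GPT header at LBA 1 and a
+	 * single EFI_RESERVED partition entry at LBA 2 spanning LBA 34
+	 * through (vs >> bs) - 1; all fields are little-endian, the
+	 * CRC32s are stored inverted, and only as much of each structure
+	 * as fits in the caller's buffer is copied out.
+	 */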
+ gpe.efi_gpe_StartingLBA = LE_64(34ULL);
+ gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
+ UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+
+ if (efi.dki_lba == 1) {
+ efi_gpt_t gpt = { 0 };
+
+ gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+ gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
+ gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+ gpt.efi_gpt_MyLBA = LE_64(1ULL);
+ gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
+ gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
+ gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
+ gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+ gpt.efi_gpt_SizeOfPartitionEntry =
+ LE_32(sizeof (efi_gpe_t));
+ CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+ gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+ CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+ gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+ if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
+ flag))
+ return (EFAULT);
+ ptr += sizeof (gpt);
+ length -= sizeof (gpt);
+ }
+ if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
+ length), flag))
+ return (EFAULT);
+ return (0);
+}
+
+/*
+ * BEGIN entry points to allow external callers access to the volume.
+ */
+/*
+ * Return the volume parameters needed for access from an external caller.
+ * These values are invariant as long as the volume is held open.
+ */
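+/*
+ * An external client (e.g. a target-mode driver) would typically call
+ * this once after open, cache the returned block size, transfer limit
+ * and opaque handles, and then use them with zvol_get_volume_size(),
+ * zvol_get_volume_wce() and zvol_log_write_minor() below.
+ */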
+int
+zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl)
+{
+ zvol_state_t *zv;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (ENXIO);
+ if (zv->zv_flags & ZVOL_DUMPIFIED)
+ return (ENXIO);
+
+ ASSERT(blksize && max_xfer_len && minor_hdl &&
+ objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+
+ *blksize = zv->zv_volblocksize;
+ *max_xfer_len = (uint64_t)zvol_maxphys;
+ *minor_hdl = zv;
+ *objset_hdl = zv->zv_objset;
+ *zil_hdl = zv->zv_zilog;
+ *rl_hdl = &zv->zv_znode;
+ *bonus_hdl = zv->zv_dbuf;
+ return (0);
+}
+
+/*
+ * Return the current volume size to an external caller.
+ * The size can change while the volume is open.
+ */
+uint64_t
+zvol_get_volume_size(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return (zv->zv_volsize);
+}
+
+/*
+ * Return the current WCE setting to an external caller.
+ * The WCE setting can change while the volume is open.
+ */
+int
+zvol_get_volume_wce(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
+}
+
+/*
+ * Entry point for external callers to zvol_log_write
+ */
void
-zvol_fini(void)
+zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
{
- mutex_destroy(&zvol_state_lock);
- ZFS_LOG(1, "ZVOL Deinitialized.");
+ zvol_state_t *zv = minor_hdl;
+
+ zvol_log_write(zv, tx, off, resid, sync);
}
+/*
+ * END entry points to allow external callers access to the volume.
+ */
-static boolean_t
-zvol_is_swap(zvol_state_t *zv)
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
- vnode_t *vp;
- boolean_t ret = B_FALSE;
- char *devpath;
- size_t devpathlen;
- int error;
+ zvol_state_t *zv;
+ struct dk_cinfo dki;
+ struct dk_minfo dkm;
+ struct dk_callback *dkc;
+ int error = 0;
+ rl_t *rl;
-#if 0
- devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
- devpath = kmem_alloc(devpathlen, KM_SLEEP);
- (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
- error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
- kmem_free(devpath, devpathlen);
+ mutex_enter(&zfsdev_state_lock);
- ret = !error && IS_SWAPVP(common_specvp(vp));
+ zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
- if (vp != NULL)
- VN_RELE(vp);
-#endif
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (ENXIO);
+ }
+ ASSERT(zv->zv_total_opens > 0);
+
+ switch (cmd) {
+
+ case DKIOCINFO:
+ bzero(&dki, sizeof (dki));
+ (void) strcpy(dki.dki_cname, "zvol");
+ (void) strcpy(dki.dki_dname, "zvol");
+ dki.dki_ctype = DKC_UNKNOWN;
+ dki.dki_unit = getminor(dev);
+ dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
+ error = EFAULT;
+ return (error);
+
+ case DKIOCGMEDIAINFO:
+ bzero(&dkm, sizeof (dkm));
+ dkm.dki_lbsize = 1U << zv->zv_min_bs;
+ dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+ dkm.dki_media_type = DK_UNKNOWN;
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+ error = EFAULT;
+ return (error);
+
+ case DKIOCGETEFI:
+ {
+ uint64_t vs = zv->zv_volsize;
+ uint8_t bs = zv->zv_min_bs;
+
+ mutex_exit(&zfsdev_state_lock);
+ error = zvol_getefi((void *)arg, flag, vs, bs);
+ return (error);
+ }
- return (ret);
+ case DKIOCFLUSHWRITECACHE:
+ dkc = (struct dk_callback *)arg;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
+ (*dkc->dkc_callback)(dkc->dkc_cookie, error);
+ error = 0;
+ }
+ return (error);
+
+ case DKIOCGETWCE:
+ {
+ int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+ if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+ flag))
+ error = EFAULT;
+ break;
+ }
+ case DKIOCSETWCE:
+ {
+ int wce;
+ if (ddi_copyin((void *)arg, &wce, sizeof (int),
+ flag)) {
+ error = EFAULT;
+ break;
+ }
+ if (wce) {
+ zv->zv_flags |= ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ } else {
+ zv->zv_flags &= ~ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+ return (0);
+ }
+
+ case DKIOCGGEOM:
+ case DKIOCGVTOC:
+ /*
+ * commands using these (like prtvtoc) expect ENOTSUP
+ * since we're emulating an EFI label
+ */
+ error = ENOTSUP;
+ break;
+
+ case DKIOCDUMPINIT:
+ rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dumpify(zv);
+ zfs_range_unlock(rl);
+ break;
+
+ case DKIOCDUMPFINI:
+ if (!(zv->zv_flags & ZVOL_DUMPIFIED))
+ break;
+ rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dump_fini(zv);
+ zfs_range_unlock(rl);
+ break;
+
+ default:
+ error = ENOTTY;
+ break;
+
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+}
+#endif /* sun */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+void
+zvol_init(void)
+{
+ VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
+ 1) == 0);
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ ZFS_LOG(1, "ZVOL Initialized.");
}
+void
+zvol_fini(void)
+{
+ mutex_destroy(&zfsdev_state_lock);
+ ddi_soft_state_fini(&zfsdev_state);
+ ZFS_LOG(1, "ZVOL Deinitialized.");
+}
+
+#ifdef sun
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
@@ -1100,11 +1717,17 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
int error = 0;
objset_t *os = zv->zv_objset;
nvlist_t *nv = NULL;
+ uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
+ DMU_OBJECT_END);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -1122,7 +1745,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&zv->zv_volsize, tx);
} else {
- uint64_t checksum, compress, refresrv, vbs;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
error = dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
@@ -1132,6 +1755,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
error = error ? error : dsl_prop_get_integer(zv->zv_name,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
+ if (version >= SPA_VERSION_DEDUP) {
+ error = error ? error :
+ dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+ }
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
@@ -1144,17 +1772,18 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
&vbs, tx);
+ error = error ? error : dmu_object_set_blocksize(
+ os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+ if (version >= SPA_VERSION_DEDUP) {
+ error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
+ &dedup, tx);
+ }
+ if (error == 0)
+ zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
- /* Truncate the file */
- if (!error)
- error = dmu_free_long_range(zv->zv_objset,
- ZVOL_OBJ, 0, DMU_OBJECT_END);
-
- if (error)
- return (error);
-
/*
* We only need update the zvol's property if we are initializing
* the dump area for the first time.
@@ -1169,11 +1798,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
ZIO_CHECKSUM_OFF) == 0);
- VERIFY(nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- SPA_MAXBLOCKSIZE) == 0);
+ if (version >= SPA_VERSION_DEDUP) {
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ ZIO_CHECKSUM_OFF) == 0);
+ }
- error = zfs_set_prop_nvlist(zv->zv_name, nv);
+ error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
nvlist_free(nv);
if (error)
@@ -1193,15 +1825,9 @@ zvol_dumpify(zvol_state_t *zv)
dmu_tx_t *tx;
objset_t *os = zv->zv_objset;
- if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))
+ if (zv->zv_flags & ZVOL_RDONLY)
return (EROFS);
- /*
- * We do not support swap devices acting as dump devices.
- */
- if (zvol_is_swap(zv))
- return (ENOTSUP);
-
if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
@@ -1251,7 +1877,8 @@ zvol_dump_fini(zvol_state_t *zv)
objset_t *os = zv->zv_objset;
nvlist_t *nv;
int error = 0;
- uint64_t checksum, compress, refresrv, vbs;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
+ uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
/*
* Attempt to restore the zvol back to its pre-dumpified state.
@@ -1286,14 +1913,312 @@ zvol_dump_fini(zvol_state_t *zv)
zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
(void) nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
- (void) nvlist_add_uint64(nv,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
- (void) zfs_set_prop_nvlist(zv->zv_name, nv);
+ if (version >= SPA_VERSION_DEDUP &&
+ zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
+ }
+ (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
nvlist_free(nv);
zvol_free_extents(zv);
zv->zv_flags &= ~ZVOL_DUMPIFIED;
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
+ zv->zv_volblocksize = vbs;
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+#endif /* sun */
+
+static zvol_state_t *
+zvol_geom_create(const char *name)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+ zvol_state_t *zv;
+
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->sectorsize = DEV_BSIZE;
+
+ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+ zv->zv_provider = pp;
+ zv->zv_state = 0;
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+
+ pp->private = zv;
+
+ return (zv);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ pp = zv->zv_provider;
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ g_topology_assert();
+
+ mtx_lock(&zv->zv_queue_mtx);
+ zv->zv_state = 1;
+ wakeup_one(&zv->zv_queue);
+ while (zv->zv_state != 2)
+ msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+ mtx_destroy(&zv->zv_queue_mtx);
+
+ pp = zv->zv_provider;
+ zv->zv_provider = NULL;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+
+ kmem_free(zv, sizeof(*zv));
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+ * To make it easier we expect either open or close, but not both
+ * at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
+ * because GEOM already handles that and handles it a bit differently.
+ * GEOM allows for multiple read/exclusive consumers and ZFS allows
+ * only one exclusive consumer, no matter if it is reader or writer.
+	 * I like the way GEOM works better, so I'll leave it to GEOM to
+	 * decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_open(pp, flags, count);
+ else
+ error = zvol_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_start(struct bio *bp)
+{
+ zvol_state_t *zv;
+ boolean_t first;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+ mtx_lock(&zv->zv_queue_mtx);
+ first = (bioq_first(&zv->zv_queue) == NULL);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (first)
+ wakeup_one(&zv->zv_queue);
+ break;
+ case BIO_GETATTR:
+ case BIO_DELETE:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv;
+ struct bio *bp;
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ zv = arg;
+ for (;;) {
+ mtx_lock(&zv->zv_queue_mtx);
+ bp = bioq_takefirst(&zv->zv_queue);
+ if (bp == NULL) {
+ if (zv->zv_state == 1) {
+ zv->zv_state = 2;
+ wakeup(&zv->zv_state);
+ mtx_unlock(&zv->zv_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+ "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zv->zv_queue_mtx);
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ zvol_strategy(bp);
+ break;
+ }
+ }
+}
+
+extern boolean_t dataset_name_hidden(const char *name);
+
+static int
+zvol_create_snapshots(objset_t *os, const char *name)
+{
+ uint64_t cookie, obj;
+ char *sname;
+ int error, len;
+
+ cookie = obj = 0;
+ sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
+ DS_FIND_SNAPSHOTS);
+
+ for (;;) {
+ len = snprintf(sname, MAXPATHLEN, "%s@", name);
+ if (len >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ error = ENAMETOOLONG;
+ break;
+ }
+
+ error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
+ sname + len, &obj, &cookie, NULL);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ if ((error = zvol_create_minor(sname)) != 0) {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ sname, error);
+ break;
+ }
+ }
+
+ kmem_free(sname, MAXPATHLEN);
+ return (error);
+}
+
+int
+zvol_create_minors(const char *name)
+{
+ uint64_t cookie;
+ objset_t *os;
+ char *osname, *p;
+ int error, len;
+
+ if (dataset_name_hidden(name))
+ return (0);
+
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+ name, error);
+ return (error);
+ }
+ if (dmu_objset_type(os) == DMU_OST_ZVOL) {
+ if ((error = zvol_create_minor(name)) == 0)
+ error = zvol_create_snapshots(os, name);
+ else {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ name, error);
+ }
+ dmu_objset_rele(os, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+
+ osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
+ return (ENOENT);
+ }
+ p = osname + strlen(osname);
+ len = MAXPATHLEN - (p - osname);
+
+ if (strchr(name, '/') == NULL) {
+ /* Prefetch only for pool name. */
+ cookie = 0;
+ while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
+ (void) dmu_objset_prefetch(p, NULL);
+ }
+
+ cookie = 0;
+ while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
+ &cookie) == 0) {
+ dmu_objset_rele(os, FTAG);
+ (void)zvol_create_minors(osname);
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+ name, error);
+ return (error);
+ }
+ }
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
index 59ee7818f56d..da397a5cca95 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/types.h>
#include <sys/time.h>
@@ -319,9 +317,9 @@ callb_generic_cpr(void *arg, int code)
#ifdef CPR_NOT_THREAD_SAFE
while (!(cp->cc_events & CALLB_CPR_SAFE))
/* cv_timedwait() returns -1 if it times out. */
- if ((ret = cv_timedwait(&cp->cc_callb_cv,
- cp->cc_lockp,
- callb_timeout_sec * hz)) == -1)
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
break;
#endif
break;
@@ -370,5 +368,71 @@ callb_unlock_table(void)
mutex_exit(&ct->ct_lock);
}
+#ifdef sun
+/*
+ * Return a boolean value indicating whether a particular kernel thread is
+ * stopped in accordance with the cpr callback protocol. If returning
+ * false, also return a pointer to the thread name via the 2nd argument.
+ */
+boolean_t
+callb_is_stopped(kthread_id_t tp, caddr_t *thread_name)
+{
+ callb_t *cp;
+ boolean_t ret_val;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON];
+ cp != NULL && tp != cp->c_thread; cp = cp->c_next)
+ ;
+
+ ret_val = (cp != NULL);
+ if (ret_val) {
+ /*
+ * We found the thread in the callback table and have
+ * provisionally set the return value to true. Now
+ * see if it is marked "safe" and is sleeping or stopped.
+ */
+ callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg;
+
+ *thread_name = cp->c_name; /* in case not stopped */
+ mutex_enter(ccp->cc_lockp);
+
+ if (ccp->cc_events & CALLB_CPR_SAFE) {
+ int retry;
+
+ mutex_exit(ccp->cc_lockp);
+ for (retry = 0; retry < CALLB_MAX_RETRY; retry++) {
+ thread_lock(tp);
+ if (tp->t_state & (TS_SLEEP | TS_STOPPED)) {
+ thread_unlock(tp);
+ break;
+ }
+ thread_unlock(tp);
+ delay(CALLB_THREAD_DELAY);
+ }
+ ret_val = retry < CALLB_MAX_RETRY;
+ } else {
+ ret_val =
+ (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0;
+ mutex_exit(ccp->cc_lockp);
+ }
+ } else {
+ /*
+ * Thread not found in callback table. Make the best
+ * attempt to identify the thread in the error message.
+ */
+ ulong_t offset;
+ char *sym = kobj_getsymname((uintptr_t)tp->t_startpc,
+ &offset);
+
+ *thread_name = sym ? sym : "*unknown*";
+ }
+
+ mutex_exit(&ct->ct_lock);
+ return (ret_val);
+}
+#endif /* sun */
+
SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
new file mode 100644
index 000000000000..3c9ba51e095e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
@@ -0,0 +1,1402 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Fault Management Architecture (FMA) Resource and Protocol Support
+ *
+ * The routines contained herein provide services to support kernel subsystems
+ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
+ *
+ * Name-Value Pair Lists
+ *
+ * The embodiment of an FMA protocol element (event, fmri or authority) is a
+ * name-value pair list (nvlist_t). FMA-specific nvlist constructor and
+ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
+ * to create an nvpair list using custom allocators. Callers may choose to
+ * allocate either from the kernel memory allocator, or from a preallocated
+ * buffer, useful in constrained contexts like high-level interrupt routines.
+ *
+ * Protocol Event and FMRI Construction
+ *
+ * Convenience routines are provided to construct nvlist events according to
+ * the FMA Event Protocol and Naming Schema specification for ereports and
+ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
+ *
+ * ENA Manipulation
+ *
+ * Routines to generate ENA formats 0, 1 and 2 are available as well as
+ * routines to increment formats 1 and 2. Individual fields within the
+ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
+ * fm_ena_format_get() and fm_ena_gen_get().
+ */
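+
+/*
+ * Typical usage, in this file's terms (illustrative only; the class
+ * string and payload member are placeholders):
+ *
+ *	nvlist_t *erp = fm_nvlist_create(NULL);
+ *	fm_ereport_set(erp, FM_EREPORT_VERS0, "io.example", ena, detector,
+ *	    "prop", DATA_TYPE_UINT64, val, NULL);
+ *	fm_ereport_post(erp, evc_flag);
+ *	fm_nvlist_destroy(erp, FM_NVA_FREE);
+ */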
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysevent.h>
+#include <sys/nvpair.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/compress.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/kstat.h>
+#include <sys/processor.h>
+#include <sys/pcpu.h>
+#include <sys/sunddi.h>
+#include <sys/systeminfo.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+
+/*
+ * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These
+ * values must be kept in sync with the FMA source code in usr/src/cmd/fm.
+ */
+static const char *fm_url = "http://www.sun.com/msg";
+static const char *fm_msgid = "SUNOS-8000-0G";
+static char *volatile fm_panicstr = NULL;
+
+#ifdef sun
+errorq_t *ereport_errorq;
+#endif
+void *ereport_dumpbuf;
+size_t ereport_dumplen;
+
+static uint_t ereport_chanlen = ERPT_EVCH_MAX;
+static evchan_t *ereport_chan = NULL;
+static ulong_t ereport_qlen = 0;
+static size_t ereport_size = 0;
+static int ereport_cols = 80;
+
+extern void fastreboot_disable_highpil(void);
+
+/*
+ * Common fault management kstats to record ereport generation
+ * failures
+ */
+
+struct erpt_kstat {
+ kstat_named_t erpt_dropped; /* num erpts dropped on post */
+ kstat_named_t erpt_set_failed; /* num erpt set failures */
+ kstat_named_t fmri_set_failed; /* num fmri set failures */
+ kstat_named_t payload_set_failed; /* num payload set failures */
+};
+
+static struct erpt_kstat erpt_kstat_data = {
+ { "erpt-dropped", KSTAT_DATA_UINT64 },
+ { "erpt-set-failed", KSTAT_DATA_UINT64 },
+ { "fmri-set-failed", KSTAT_DATA_UINT64 },
+ { "payload-set-failed", KSTAT_DATA_UINT64 }
+};
+
+#ifdef sun
+/*ARGSUSED*/
+static void
+fm_drain(void *private, void *data, errorq_elem_t *eep)
+{
+ nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep);
+
+ if (!panicstr)
+ (void) fm_ereport_post(nvl, EVCH_TRYHARD);
+ else
+ fm_nvprint(nvl);
+}
+#endif
+
+void
+fm_init(void)
+{
+ kstat_t *ksp;
+
+#ifdef sun
+ (void) sysevent_evc_bind(FM_ERROR_CHAN,
+ &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
+
+ (void) sysevent_evc_control(ereport_chan,
+ EVCH_SET_CHAN_LEN, &ereport_chanlen);
+#endif
+
+ if (ereport_qlen == 0)
+ ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
+
+ if (ereport_size == 0)
+ ereport_size = ERPT_DATA_SZ;
+
+#ifdef sun
+ ereport_errorq = errorq_nvcreate("fm_ereport_queue",
+ (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
+ FM_ERR_PIL, ERRORQ_VITAL);
+ if (ereport_errorq == NULL)
+ panic("failed to create required ereport error queue");
+#endif
+
+ ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
+ ereport_dumplen = ereport_size;
+
+ /* Initialize ereport allocation and generation kstats */
+ ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
+ sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp != NULL) {
+ ksp->ks_data = &erpt_kstat_data;
+ kstat_install(ksp);
+ } else {
+ cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
+ }
+}
+
+#ifdef sun
+/*
+ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
+ * output so they aren't split across console lines, and return the end column.
+ */
+/*PRINTFLIKE4*/
+static int
+fm_printf(int depth, int c, int cols, const char *format, ...)
+{
+ va_list ap;
+ int width;
+ char c1;
+
+ va_start(ap, format);
+ width = vsnprintf(&c1, sizeof (c1), format, ap);
+ va_end(ap);
+
+ if (c + width >= cols) {
+ console_printf("\n\r");
+ c = 0;
+ if (format[0] != ' ' && depth > 0) {
+ console_printf(" ");
+ c++;
+ }
+ }
+
+ va_start(ap, format);
+ console_vprintf(format, ap);
+ va_end(ap);
+
+ return ((c + width) % cols);
+}
+
+/*
+ * Recursively print a nvlist in the specified column width and return the
+ * column we end up in. This function is called recursively by fm_nvprint(),
+ * below. We generically format the entire nvpair using hexadecimal
+ * integers and strings, and elide any integer arrays. Arrays are basically
+ * used for cache dumps right now, so we suppress them so as not to overwhelm
+ * the amount of console output we produce at panic time. This can be further
+ * enhanced as FMA technology grows based upon the needs of consumers. All
+ * FMA telemetry is logged using the dump device transport, so the console
+ * output serves only as a fallback in case this procedure is unsuccessful.
+ */
+static int
+fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ if (strcmp(name, FM_CLASS) == 0)
+ continue; /* already printed by caller */
+
+ c = fm_printf(d, c, cols, " %s=", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ c = fm_printf(d + 1, c, cols, " 1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ c = fm_printf(d + 1, c, cols, b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ c = fm_printf(d + 1, c, cols, "\"%s\"",
+ str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ c = fm_nvprintr(cnv, d + 1, c, cols);
+ c = fm_printf(d + 1, c, cols, " ]");
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++) {
+ c = fm_nvprintr(val[i], d + 1, c, cols);
+ }
+ c = fm_printf(d + 1, c, cols, " ]");
+ }
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ c = fm_printf(d + 1, c, cols, "[...]");
+ break;
+ case DATA_TYPE_UNKNOWN:
+ c = fm_printf(d + 1, c, cols, "<unknown>");
+ break;
+ }
+ }
+
+ return (c);
+}
+
+void
+fm_nvprint(nvlist_t *nvl)
+{
+ char *class;
+ int c = 0;
+
+ console_printf("\r");
+
+ if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
+ c = fm_printf(0, c, ereport_cols, "%s", class);
+
+ if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0)
+ console_printf("\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Wrapper for panic() that first produces an FMA-style message for admins.
+ * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this
+ * is the one exception to that rule and the only error that gets messaged.
+ * This function is intended for use by subsystems that have detected a fatal
+ * error and enqueued appropriate ereports and wish to then force a panic.
+ */
+/*PRINTFLIKE1*/
+void
+fm_panic(const char *format, ...)
+{
+ va_list ap;
+
+ (void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+#if defined(__i386) || defined(__amd64)
+ fastreboot_disable_highpil();
+#endif /* __i386 || __amd64 */
+ va_start(ap, format);
+ vpanic(format, ap);
+ va_end(ap);
+}
+
+/*
+ * Simply tell the caller if fm_panicstr is set, i.e., an fma event has
+ * caused the panic. If so, something other than the default panic
+ * diagnosis method will diagnose the cause of the panic.
+ */
+int
+is_fm_panic()
+{
+ if (fm_panicstr)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Print any appropriate FMA banner message before the panic message. This
+ * function is called by panicsys() and prints the message for fm_panic().
+ * We print the message here so that it comes after the system is quiesced.
+ * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
+ * The rest of the message is for the console only and not needed in the log,
+ * so it is printed using console_printf(). We break it up into multiple
+ * chunks so as to avoid overflowing any small legacy prom_printf() buffers.
+ */
+void
+fm_banner(void)
+{
+ timespec_t tod;
+ hrtime_t now;
+
+ if (!fm_panicstr)
+ return; /* panic was not initiated by fm_panic(); do nothing */
+
+ if (panicstr) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
+ "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
+
+ console_printf(
+"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
+"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
+ fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
+
+ console_printf(
+"PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
+"SOURCE: %s, REV: %s %s\n",
+ platform, utsname.nodename, utsname.sysname,
+ utsname.release, utsname.version);
+
+ console_printf(
+"DESC: Errors have been detected that require a reboot to ensure system\n"
+"integrity. See %s/%s for more information.\n",
+ fm_url, fm_msgid);
+
+ console_printf(
+"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
+"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
+"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Utility function to write all of the pending ereports to the dump device.
+ * This function is called at either normal reboot or panic time, and simply
+ * iterates over the in-transit messages in the ereport sysevent channel.
+ */
+void
+fm_ereport_dump(void)
+{
+ evchanq_t *chq;
+ sysevent_t *sep;
+ erpt_dump_t ed;
+
+ timespec_t tod;
+ hrtime_t now;
+ char *buf;
+ size_t len;
+
+ if (panicstr) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ if (ereport_errorq != NULL)
+ errorq_drain(ereport_errorq);
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ /*
+ * In the panic case, sysevent_evc_walk_init() will return NULL.
+ */
+ if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
+ !panicstr)
+ return; /* event channel isn't initialized yet */
+
+ while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
+ if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
+ break;
+
+ ed.ed_magic = ERPT_MAGIC;
+ ed.ed_chksum = checksum32(buf, len);
+ ed.ed_size = (uint32_t)len;
+ ed.ed_pad = 0;
+ ed.ed_hrt_nsec = SE_TIME(sep);
+ ed.ed_hrt_base = now;
+ ed.ed_tod_base.sec = tod.tv_sec;
+ ed.ed_tod_base.nsec = tod.tv_nsec;
+
+ dumpvp_write(&ed, sizeof (ed));
+ dumpvp_write(buf, len);
+ }
+
+ sysevent_evc_walk_fini(chq);
+}
+#endif
+
+/*
+ * Post an error report (ereport) to the sysevent error channel. The error
+ * channel must be established with a prior call to sysevent_evc_create()
+ * before publication may occur.
+ */
+void
+fm_ereport_post(nvlist_t *ereport, int evc_flag)
+{
+ size_t nvl_size = 0;
+ evchan_t *error_chan;
+ sysevent_id_t eid;
+
+ (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
+ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ return;
+ }
+
+#ifdef sun
+ if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
+ EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ return;
+ }
+
+ if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
+ SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ (void) sysevent_evc_unbind(error_chan);
+ return;
+ }
+ (void) sysevent_evc_unbind(error_chan);
+#else
+ (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
+#endif
+}
+
+/*
+ * Wrappers for FM nvlist allocators
+ */
+/* ARGSUSED */
+static void *
+i_fm_alloc(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_zalloc(size, KM_SLEEP));
+}
+
+/* ARGSUSED */
+static void
+i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t fm_mem_alloc_ops = {
+ NULL,
+ NULL,
+ i_fm_alloc,
+ i_fm_free,
+ NULL
+};
+
+/*
+ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
+ * to the newly allocated nv_alloc_t structure is returned upon success or NULL
+ * is returned to indicate that the nv_alloc structure could not be created.
+ */
+nv_alloc_t *
+fm_nva_xcreate(char *buf, size_t bufsz)
+{
+ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+
+ return (nvhdl);
+}
+
+/*
+ * Destroy a previously allocated nv_alloc structure. The fixed buffer
+ * associated with nva must be freed by the caller.
+ */
+void
+fm_nva_xdestroy(nv_alloc_t *nva)
+{
+ nv_alloc_fini(nva);
+ kmem_free(nva, sizeof (nv_alloc_t));
+}
+
+/*
+ * Create a new nv list. A pointer to a new nv list structure is returned
+ * upon success or NULL is returned to indicate that the structure could
+ * not be created. The newly created nv list is created and managed by the
+ * operations installed in nva. If nva is NULL, the default FMA nva
+ * operations are installed and used.
+ *
+ * When called from the kernel and nva == NULL, this function must be called
+ * from passive kernel context with no locks held that can prevent a
+ * sleeping memory allocation from occurring. Otherwise, this function may
+ * be called from other kernel contexts as long as a valid nva created via
+ * fm_nva_create() is supplied.
+ */
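+/*
+ * For example, a caller using the default allocator pairs this with
+ * fm_nvlist_destroy(nvl, FM_NVA_FREE):
+ *
+ *	nvlist_t *nvl = fm_nvlist_create(NULL);
+ *	...
+ *	fm_nvlist_destroy(nvl, FM_NVA_FREE);
+ */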
+nvlist_t *
+fm_nvlist_create(nv_alloc_t *nva)
+{
+ int hdl_alloced = 0;
+ nvlist_t *nvl;
+ nv_alloc_t *nvhdl;
+
+ if (nva == NULL) {
+ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+ hdl_alloced = 1;
+ } else {
+ nvhdl = nva;
+ }
+
+ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
+ if (hdl_alloced) {
+ nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ }
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Destroy a previously allocated nvlist structure. flag indicates whether
+ * or not the associated nva structure should be freed (FM_NVA_FREE) or
+ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
+ * it to be re-used for future nvlist creation operations.
+ */
+void
+fm_nvlist_destroy(nvlist_t *nvl, int flag)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
+
+ nvlist_free(nvl);
+
+ if (nva != NULL) {
+ if (flag == FM_NVA_FREE)
+ fm_nva_xdestroy(nva);
+ }
+}
+
+int
+i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
+{
+ int nelem, ret = 0;
+ data_type_t type;
+
+ while (ret == 0 && name != NULL) {
+ type = va_arg(ap, data_type_t);
+ switch (type) {
+ case DATA_TYPE_BYTE:
+ ret = nvlist_add_byte(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_byte_array(payload, name,
+ va_arg(ap, uchar_t *), nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ ret = nvlist_add_boolean_value(payload, name,
+ va_arg(ap, boolean_t));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_boolean_array(payload, name,
+ va_arg(ap, boolean_t *), nelem);
+ break;
+ case DATA_TYPE_INT8:
+ ret = nvlist_add_int8(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int8_array(payload, name,
+ va_arg(ap, int8_t *), nelem);
+ break;
+ case DATA_TYPE_UINT8:
+ ret = nvlist_add_uint8(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint8_array(payload, name,
+ va_arg(ap, uint8_t *), nelem);
+ break;
+ case DATA_TYPE_INT16:
+ ret = nvlist_add_int16(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int16_array(payload, name,
+ va_arg(ap, int16_t *), nelem);
+ break;
+ case DATA_TYPE_UINT16:
+ ret = nvlist_add_uint16(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint16_array(payload, name,
+ va_arg(ap, uint16_t *), nelem);
+ break;
+ case DATA_TYPE_INT32:
+ ret = nvlist_add_int32(payload, name,
+ va_arg(ap, int32_t));
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int32_array(payload, name,
+ va_arg(ap, int32_t *), nelem);
+ break;
+ case DATA_TYPE_UINT32:
+ ret = nvlist_add_uint32(payload, name,
+ va_arg(ap, uint32_t));
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint32_array(payload, name,
+ va_arg(ap, uint32_t *), nelem);
+ break;
+ case DATA_TYPE_INT64:
+ ret = nvlist_add_int64(payload, name,
+ va_arg(ap, int64_t));
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int64_array(payload, name,
+ va_arg(ap, int64_t *), nelem);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvlist_add_uint64(payload, name,
+ va_arg(ap, uint64_t));
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint64_array(payload, name,
+ va_arg(ap, uint64_t *), nelem);
+ break;
+ case DATA_TYPE_STRING:
+ ret = nvlist_add_string(payload, name,
+ va_arg(ap, char *));
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_string_array(payload, name,
+ va_arg(ap, char **), nelem);
+ break;
+ case DATA_TYPE_NVLIST:
+ ret = nvlist_add_nvlist(payload, name,
+ va_arg(ap, nvlist_t *));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_nvlist_array(payload, name,
+ va_arg(ap, nvlist_t **), nelem);
+ break;
+ default:
+ ret = EINVAL;
+ }
+
+ name = va_arg(ap, char *);
+ }
+ return (ret);
+}
+
+void
+fm_payload_set(nvlist_t *payload, ...)
+{
+ int ret;
+ const char *name;
+ va_list ap;
+
+ va_start(ap, payload);
+ name = va_arg(ap, char *);
+ ret = i_fm_payload_set(payload, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_add_64(
+ &erpt_kstat_data.payload_set_failed.value.ui64, 1);
+}
+
+/*
+ * Set-up and validate the members of an ereport event according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * class string ereport
+ * version uint8_t 0
+ * ena uint64_t <ena>
+ * detector nvlist_t <detector>
+ * ereport-payload nvlist_t <var args>
+ *
+ * We don't actually add a 'version' member to the payload. Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else. Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
+ */
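+/*
+ * Illustrative call (the class string and payload members are
+ * placeholders):
+ *
+ *	fm_ereport_set(erp, FM_EREPORT_VERS0, "io.example.err", ena, det,
+ *	    "op", DATA_TYPE_STRING, "read",
+ *	    "size", DATA_TYPE_UINT64, (uint64_t)512,
+ *	    NULL);
+ */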
+void
+fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
+ uint64_t ena, const nvlist_t *detector, ...)
+{
+ char ereport_class[FM_MAX_CLASS];
+ const char *name;
+ va_list ap;
+ int ret;
+
+ if (version != FM_EREPORT_VERS0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ return;
+ }
+
+ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
+ FM_EREPORT_CLASS, erpt_class);
+ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ }
+
+ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
+ (nvlist_t *)detector) != 0) {
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ }
+
+ va_start(ap, detector);
+ name = va_arg(ap, const char *);
+ ret = i_fm_payload_set(ereport, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+}
+
+/*
+ * Set-up and validate the members of an hc fmri according to:
+ *
+ * Member name Type Value
+ * ===================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * hc-name string <name>
+ * hc-id string <id>
+ *
+ * Note that auth and hc-id are optional members.
+ */
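+/*
+ * Illustrative call (the hc-name/id pairs are placeholders):
+ *
+ *	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, 2,
+ *	    "motherboard", 0, "chip", 1);
+ */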
+
+#define HC_MAXPAIRS 20
+#define HC_MAXNAMELEN 50
+
+static int
+fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
+{
+ if (version != FM_HC_SCHEME_VERSION) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
+ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return (0);
+ }
+
+ return (1);
+}
+
+void
+fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ va_list ap;
+ int i;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = 0; i < npairs; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+ va_end(ap);
+
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+
+ for (i = 0; i < npairs; i++)
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
+
+/*
+ * Set-up and validate the members of a dev fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * devpath string <devpath>
+ * [devid] string <devid>
+ * [target-port-l0id] string <target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+ const char *devpath, const char *devid, const char *tpl0)
+{
+ int err = 0;
+
+ if (version != DEV_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+ if (auth != NULL) {
+ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth);
+ }
+
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+ if (devid != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+ if (tpl0 != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+ if (err)
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+
+}
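+/*
+ * For example, a hypothetical dev FMRI for a disk with a known devid and no
+ * target-port-lun0 member (the path and devid strings are illustrative)
+ * could be built as:
+ *
+ *     fm_fmri_dev_set(fmri, DEV_SCHEME_VERSION0, NULL,
+ *         "/pci@0,0/pci1022,7450@2/sd@0,0",
+ *         "id1,sd@SSEAGATE_ST336607LSUN36G", NULL);
+ */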
+
+/*
+ * Set up and validate the members of a cpu fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * cpuid uint32_t <cpu_id>
+ * cpumask uint8_t <cpu_mask>
+ * serial uint64_t <serial_id>
+ *
+ * Note that auth, cpumask, and serial are optional members.
+ */
+void
+fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
+ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
+{
+ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
+
+ if (version < CPU_SCHEME_VERSION1) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
+ FM_FMRI_SCHEME_CPU) != 0) {
+ atomic_add_64(failedp, 1);
+ return;
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
+ *cpu_maskp) != 0)
+ atomic_add_64(failedp, 1);
+
+ if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
+ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
+ atomic_add_64(failedp, 1);
+}
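+/*
+ * For example, a hypothetical cpu FMRI for CPU 4 with a mask and serial
+ * (all values illustrative) could be built as:
+ *
+ *     uint8_t mask = 0;
+ *     fm_fmri_cpu_set(fmri, CPU_SCHEME_VERSION1, NULL, 4, &mask, "12345678");
+ */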
+
+/*
+ * Set up and validate the members of a mem fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth> [optional]
+ * unum string <unum>
+ * serial string <serial> [optional*]
+ * offset uint64_t <offset> [optional]
+ *
+ * * serial is required if offset is present
+ */
+void
+fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ const char *unum, const char *serial, uint64_t offset)
+{
+ if (version != MEM_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (!serial && (offset != (uint64_t)-1)) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (auth != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (serial != NULL) {
+ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
+ (char **)&serial, 1) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ if (offset != (uint64_t)-1) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET,
+ offset) != 0) {
+ atomic_add_64(&erpt_kstat_data.
+ fmri_set_failed.value.ui64, 1);
+ }
+ }
+ }
+}
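+/*
+ * For example, a hypothetical mem FMRI carrying an offset (which, per the
+ * rule above, also requires a serial; the unum and serial values are
+ * illustrative) could be built as:
+ *
+ *     fm_fmri_mem_set(fmri, MEM_SCHEME_VERSION0, NULL,
+ *         "DIMM0", "1234-5678", 0x1000);
+ */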
+
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
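+/*
+ * For example, a zfs FMRI naming a specific vdev within a pool could be
+ * built from the pool and vdev GUIDs (both values illustrative):
+ *
+ *     fm_fmri_zfs_set(fmri, ZFS_SCHEME_VERSION0,
+ *         0x1234567890abcdefULL, 0xfedcba0987654321ULL);
+ */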
+
+uint64_t
+fm_ena_increment(uint64_t ena)
+{
+ uint64_t new_ena;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
+ break;
+ case FM_ENA_FMT2:
+ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
+ break;
+ default:
+ new_ena = 0;
+ }
+
+ return (new_ena);
+}
+
+uint64_t
+fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
+{
+ uint64_t ena = 0;
+
+ switch (format) {
+ case FM_ENA_FMT1:
+ if (timestamp) {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((timestamp << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ } else {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ }
+ break;
+ case FM_ENA_FMT2:
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
+ break;
+ default:
+ break;
+ }
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generate(uint64_t timestamp, uchar_t format)
+{
+ return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
+}
+
+uint64_t
+fm_ena_generation_get(uint64_t ena)
+{
+ uint64_t gen;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
+ break;
+ default:
+ gen = 0;
+ break;
+ }
+
+ return (gen);
+}
+
+uchar_t
+fm_ena_format_get(uint64_t ena)
+{
+ return (ENA_FORMAT(ena));
+}
+
+uint64_t
+fm_ena_id_get(uint64_t ena)
+{
+ uint64_t id;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
+ break;
+ default:
+ id = 0;
+ }
+
+ return (id);
+}
+
+uint64_t
+fm_ena_time_get(uint64_t ena)
+{
+ uint64_t time;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
+ break;
+ default:
+ time = 0;
+ }
+
+ return (time);
+}
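+/*
+ * Taken together, the ENA helpers above let a caller mint an ENA and later
+ * recover its fields, e.g. for FMT1 (a timestamp of 0 requests the current
+ * high-resolution time):
+ *
+ *     uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
+ *     ASSERT(fm_ena_format_get(ena) == FM_ENA_FMT1);
+ *     uint64_t t = fm_ena_time_get(ena);
+ */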
+
+#ifdef sun
+/*
+ * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
+ * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
+ */
+void
+fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
+{
+ int i;
+ char *sym;
+ ulong_t off;
+ char *stkpp[FM_STK_DEPTH];
+ char buf[FM_STK_DEPTH * FM_SYM_SZ];
+ char *stkp = buf;
+
+ for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
+ if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
+ (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
+ else
+ (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
+ stkpp[i] = stkp;
+ }
+
+ fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
+ DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL);
+}
+#endif
+
+#ifdef sun
+void
+print_msg_hwerr(ctid_t ct_id, proc_t *p)
+{
+ uprintf("Killed process %d (%s) in contract id %d "
+ "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
+}
+#endif
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ nvlist_t **hcl;
+ uint_t n;
+ int i, j;
+ va_list ap;
+ char *hcname, *hcid;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ /*
+ * copy the bboard nvpairs to the pairs array
+ */
+ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+ != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+ &hcname) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+
+ /*
+ * Create the remaining pairs from the passed-in name/id arguments
+ */
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = n; i < npairs + n; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+ va_end(ap);
+
+ /*
+ * Create the fmri hc list
+ */
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+ npairs + n) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ for (i = 0; i < npairs + n; i++) {
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+ }
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+ }
+}
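+/*
+ * For example, given an existing hc-scheme FMRI "bboard" (say, one describing
+ * an enclosure), a hypothetical caller could append one more name/id pair for
+ * a disk bay (the pair name is illustrative):
+ *
+ *     fm_fmri_hc_create(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, bboard, 1,
+ *         "bay", 3);
+ */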
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
index b0ec0639781a..991978e41d19 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
@@ -37,7 +37,7 @@
#undef _SYS_ACL_H
#include_next <sys/acl.h>
#define _SYS_ACL_H
-#endif /* _KERNEL */
+#endif /* _KERNEL */
#ifdef __cplusplus
extern "C" {
@@ -57,7 +57,7 @@ typedef struct ace {
uint16_t a_type; /* allow or deny */
} ace_t;
-#if !defined(_KERNEL)
+#ifndef _KERNEL
typedef struct acl_info acl_t;
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
index 878ddcbeb09f..8718f5bcf63f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
@@ -47,7 +47,7 @@ typedef enum acl_type {
} zfs_acl_type_t;
struct acl_info {
- zfs_acl_type_t acl_type; /* style of acl */
+ zfs_acl_type_t acl_type; /* style of acl */
int acl_cnt; /* number of acl entries */
int acl_entry_size; /* sizeof acl entry */
int acl_flags; /* special flags about acl */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
index 02263a5a0cf1..ba305c908239 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _AVL_H
#define _AVL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This is a private header file. Applications should not directly include
* this file.
@@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree,
* node - node that has the value being looked for
* where - position for use with avl_nearest() or avl_insert(), may be NULL
*/
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
/*
* Insert a node into the tree.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h b/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h
deleted file mode 100644
index a2bab580a54b..000000000000
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#ifndef _SYS_BYTEORDER_H
-#define _SYS_BYTEORDER_H
-
-#include <sys/isa_defs.h>
-#include <sys/int_types.h>
-
-#if defined(__GNUC__) && defined(_ASM_INLINES) && \
- (defined(__i386) || defined(__amd64))
-#include <asm/byteorder.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * macros for conversion between host and (internet) network byte order
- */
-
-#if BYTE_ORDER == _BIG_ENDIAN && !defined(ntohl) && !defined(__lint)
-/* big-endian */
-#define ntohl(x) (x)
-#define ntohs(x) (x)
-#define htonl(x) (x)
-#define htons(x) (x)
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-#define ntohll(x) (x)
-#define htonll(x) (x)
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-
-#elif !defined(ntohl) /* little-endian */
-
-#ifndef _IN_PORT_T
-#define _IN_PORT_T
-typedef uint16_t in_port_t;
-#endif
-
-#ifndef _IN_ADDR_T
-#define _IN_ADDR_T
-typedef uint32_t in_addr_t;
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5)
-extern uint32_t htonl(uint32_t);
-extern uint16_t htons(uint16_t);
-extern uint32_t ntohl(uint32_t);
-extern uint16_t ntohs(uint16_t);
-#else
-extern in_addr_t htonl(in_addr_t);
-extern in_port_t htons(in_port_t);
-extern in_addr_t ntohl(in_addr_t);
-extern in_port_t ntohs(in_port_t);
-#endif /* !_XPG4_2 || __EXTENSIONS__ || _XPG5 */
-
-#if defined(_LP64) || defined(_LONGLONG_TYPE)
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-extern uint64_t htonll(uint64_t);
-extern uint64_t ntohll(uint64_t);
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-#endif /* _LP64 || _LONGLONG_TYPE */
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-
-/*
- * Macros to reverse byte order
- */
-#define BSWAP_8(x) ((x) & 0xff)
-#if !defined(__i386) && !defined(__amd64)
-#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
-#define BSWAP_32(x) (((uint32_t)(x) << 24) | \
- (((uint32_t)(x) << 8) & 0xff0000) | \
- (((uint32_t)(x) >> 8) & 0xff00) | \
- ((uint32_t)(x) >> 24))
-#else /* x86 */
-#define BSWAP_16(x) htons(x)
-#define BSWAP_32(x) htonl(x)
-#endif /* !__i386 && !__amd64 */
-
-#if defined(_LP64) || defined(_LONGLONG_TYPE)
-#if (!defined(__i386) && !defined(__amd64))
-#define BSWAP_64(x) (((uint64_t)(x) << 56) | \
- (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
- (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
- (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
- (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
- (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
- (((uint64_t)(x) >> 40) & 0xff00ULL) | \
- ((uint64_t)(x) >> 56))
-#else /* x86 */
-#define BSWAP_64(x) htonll(x)
-#endif /* !__i386 && !__amd64 */
-#else /* no uint64_t */
-#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
-#endif /* _LP64 || _LONGLONG_TYPE */
-
-#define BMASK_8(x) ((x) & 0xff)
-#define BMASK_16(x) ((x) & 0xffff)
-#define BMASK_32(x) ((x) & 0xffffffff)
-#define BMASK_64(x) (x)
-
-/*
- * Macros to convert from a specific byte order to/from native byte order
- */
-#if BYTE_ORDER == _BIG_ENDIAN
-#define BE_8(x) BMASK_8(x)
-#define BE_16(x) BMASK_16(x)
-#define BE_32(x) BMASK_32(x)
-#define BE_64(x) BMASK_64(x)
-#define LE_8(x) BSWAP_8(x)
-#define LE_16(x) BSWAP_16(x)
-#define LE_32(x) BSWAP_32(x)
-#define LE_64(x) BSWAP_64(x)
-#else
-#define LE_8(x) BMASK_8(x)
-#define LE_16(x) BMASK_16(x)
-#define LE_32(x) BMASK_32(x)
-#define LE_64(x) BMASK_64(x)
-#define BE_8(x) BSWAP_8(x)
-#define BE_16(x) BSWAP_16(x)
-#define BE_32(x) BSWAP_32(x)
-#define BE_64(x) BSWAP_64(x)
-#endif
-
-#endif /* !_XPG4_2 || __EXTENSIONS__ */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BYTEORDER_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
index 2b9ac3f98388..43f14ebb369d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_CALLB_H
#define _SYS_CALLB_H
-#pragma ident "@(#)callb.h 1.29 05/06/23 SMI"
-
#include <sys/kcondvar.h>
#ifdef __cplusplus
@@ -68,7 +65,8 @@ extern "C" {
#define CB_CL_MDBOOT CB_CL_UADMIN
#define CB_CL_ENTER_DEBUGGER 14
#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
+#define CB_CL_CPU_DEEP_IDLE 16
+#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */
/*
* CB_CL_CPR_DAEMON class specific definitions are given below:
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
index b9e0da4e1993..b44dda5e8418 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_CPUPART_H
#define _SYS_CPUPART_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/processor.h>
#include <sys/cpuvar.h>
@@ -58,16 +55,6 @@ typedef int cpupartid_t;
#define CP_ALL 0 /* return all cpu partitions */
#define CP_NONEMPTY 1 /* return only non-empty ones */
-#if defined(_MACHDEP)
-struct mach_cpupart {
- cpuset_t mc_haltset;
-};
-
-extern struct mach_cpupart cp_default_mach;
-#else
-struct mach_cpupart;
-#endif
-
typedef struct cpupart {
disp_t cp_kp_queue; /* partition-wide kpreempt queue */
cpupartid_t cp_id; /* partition ID */
@@ -103,8 +90,7 @@ typedef struct cpupart {
lgrp_gen_t cp_gen; /* generation number */
lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */
bitset_t cp_cmt_pgs; /* CMT PGs represented */
-
- struct mach_cpupart *cp_mach; /* mach-specific */
+ bitset_t cp_haltset; /* halted CPUs */
} cpupart_t;
typedef struct cpupart_kstat {
@@ -138,6 +124,15 @@ extern cpupart_t *cp_list_head;
extern uint_t cp_numparts;
extern uint_t cp_numparts_nonempty;
+/*
+ * Each partition contains a bitset that indicates which CPUs are halted and
+ * which ones are running. Given the growing number of CPUs in current and
+ * future platforms, it's important to fanout each CPU within its partition's
+ * haltset to prevent contention due to false sharing. The fanout factor
+ * is platform specific, and declared accordingly.
+ */
+extern uint_t cp_haltset_fanout;
+
extern void cpupart_initialize_default();
extern cpupart_t *cpupart_find(psetid_t);
extern int cpupart_create(psetid_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
index 0a038e00d0e4..d4075d580be7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_CPUVAR_H
@@ -168,7 +167,7 @@ typedef struct cpu {
ftrace_data_t cpu_ftrace; /* per cpu ftrace data */
- clock_t cpu_deadman_lbolt; /* used by deadman() */
+ clock_t cpu_deadman_counter; /* used by deadman() */
uint_t cpu_deadman_countdown; /* used by deadman() */
kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */
@@ -211,12 +210,27 @@ typedef struct cpu {
uint64_t cpu_curr_clock; /* current clock freq in Hz */
char *cpu_supp_freqs; /* supported freqs in Hz */
+ uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */
+ uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */
+
/*
* Interrupt load factor used by dispatcher & softcall
*/
hrtime_t cpu_intrlast; /* total interrupt time (nsec) */
int cpu_intrload; /* interrupt load factor (0-99%) */
+ uint_t cpu_rotor; /* for cheap pseudo-random numbers */
+
+ struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */
+
+ /*
+ * cpu_generation is updated whenever CPU goes on-line or off-line.
+ * Updates to cpu_generation are protected by cpu_lock.
+ *
+ * See CPU_NEW_GENERATION() macro below.
+ */
+ volatile uint_t cpu_generation; /* tracking on/off-line */
+
/*
* New members must be added /before/ this member, as the CTF tools
* rely on this being the last field before cpu_m, so they can
@@ -238,12 +252,13 @@ typedef struct cpu {
* is up to the platform to assure that this is performed properly. Note that
* the structure is sized to avoid false sharing.
*/
-#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uintptr_t) + \
- sizeof (kmutex_t))
+#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \
+ sizeof (uintptr_t) + sizeof (kmutex_t))
#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE
typedef struct cpu_core {
uint16_t cpuc_dtrace_flags; /* DTrace flags */
+ uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */
uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */
uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */
kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */
@@ -261,6 +276,28 @@ extern cpu_core_t cpu_core[];
*/
#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1))
+/*
+ * Check to see if an interrupt thread might be active at a given ipl.
+ * If so return true.
+ * We must be conservative--it is ok to give a false yes, but a false no
+ * will cause disaster. (But if the situation changes after we check it is
+ * ok--the caller is trying to ensure that an interrupt routine has been
+ * exited).
+ * This is used when trying to remove an interrupt handler from an autovector
+ * list in avintr.c.
+ */
+#define INTR_ACTIVE(cpup, level) \
+ ((level) <= LOCK_LEVEL ? \
+ ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup)))
+
+/*
+ * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one
+ * looks at it. It's meant as a cheap mechanism to be incorporated in routines
+ * wanting to avoid biasing, but where true randomness isn't needed (just
+ * something that changes).
+ */
+#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++)
+
#if defined(_KERNEL) || defined(_KMEMUSER)
#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE)
@@ -352,7 +389,6 @@ extern cpu_core_t cpu_core[];
#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
-
#endif /* _KERNEL || _KMEMUSER */
#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
@@ -516,6 +552,7 @@ extern cpuset_t cpu_seqid_inuse;
#if defined(_KERNEL) || defined(_KMEMUSER)
extern struct cpu *cpu[]; /* indexed by CPU number */
+extern struct cpu **cpu_seq; /* indexed by sequential CPU id */
extern cpu_t *cpu_list; /* list of CPUs */
extern cpu_t *cpu_active; /* list of active CPUs */
extern int ncpus; /* number of CPUs present */
@@ -526,6 +563,7 @@ extern int boot_ncpus; /* # cpus present @ boot */
extern processorid_t max_cpuid; /* maximum CPU number */
extern struct cpu *cpu_inmotion; /* offline or partition move target */
extern cpu_t *clock_cpu_list;
+extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */
#if defined(__i386) || defined(__amd64)
extern struct cpu *curcpup(void);
@@ -569,6 +607,13 @@ extern struct cpu *curcpup(void);
#define CPU_STATS(cp, stat) \
((cp)->cpu_stats.stat)
+/*
+ * Increment CPU generation value.
+ * This macro should be called whenever CPU goes on-line or off-line.
+ * Updates to cpu_generation should be protected by cpu_lock.
+ */
+#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++)
+
#endif /* _KERNEL || _KMEMUSER */
/*
@@ -658,6 +703,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */
const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
+void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
/* frequencies */
@@ -697,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */
*/
extern kmutex_t cpu_lock; /* lock protecting CPU data */
+/*
+ * CPU state change events
+ *
+ * Various subsystems need to know when CPUs change their state. They get this
+ * information by registering CPU state change callbacks using
+ * register_cpu_setup_func(). Whenever any CPU changes its state, the callback
+ * function is called. The callback function is passed three arguments:
+ *
+ * Event, described by cpu_setup_t
+ * CPU ID
+ * Transparent pointer passed when registering the callback
+ *
+ * The callback function is called with cpu_lock held. The return value from the
+ * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG
+ * events. For these two events, non-zero return value indicates a failure and
+ * prevents successful completion of the operation.
+ *
+ * New events may be added in the future. Callback functions should ignore any
+ * events that they do not understand.
+ *
+ * The following events provide notification callbacks:
+ *
+ * CPU_INIT A new CPU is started and added to the list of active CPUs
+ * This event is only used during boot
+ *
+ * CPU_CONFIG A newly inserted CPU is prepared to start running code
+ * This event is called by DR code
+ *
+ * CPU_UNCONFIG CPU has been powered off and needs cleanup
+ * This event is called by DR code
+ *
+ * CPU_ON CPU is enabled but does not run anything yet
+ *
+ * CPU_INTR_ON CPU is enabled and has interrupts enabled
+ *
+ * CPU_OFF CPU is going offline but can still run threads
+ *
+ * CPU_CPUPART_OUT CPU is going to move out of its partition
+ *
+ * CPU_CPUPART_IN CPU is going to move to a new partition
+ *
+ * CPU_SETUP CPU is set up during boot and can run threads
+ */
typedef enum {
CPU_INIT,
CPU_CONFIG,
@@ -704,7 +793,9 @@ typedef enum {
CPU_ON,
CPU_OFF,
CPU_CPUPART_IN,
- CPU_CPUPART_OUT
+ CPU_CPUPART_OUT,
+ CPU_SETUP,
+ CPU_INTR_ON
} cpu_setup_t;
typedef int cpu_setup_func_t(cpu_setup_t, int, void *);
@@ -718,6 +809,13 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *);
extern void cpu_state_change_notify(int, cpu_setup_t);
/*
+ * Call specified function on the given CPU
+ */
+typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t);
+extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t);
+
+
+/*
* Create various strings that describe the given CPU for the
* processor_info system call and configuration-related kstats.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
index e84f1e04305d..5056f9a51105 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,8 +34,6 @@
#ifndef _SYS_CRED_H
#define _SYS_CRED_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#ifdef __cplusplus
@@ -58,6 +56,7 @@ struct prcred;
struct ksid;
struct ksidlist;
struct credklpd;
+struct credgrp;
struct auditinfo_addr; /* cred.h is included in audit.h */
@@ -79,6 +78,7 @@ extern cred_t *crdup(cred_t *);
extern void crdup_to(cred_t *, cred_t *);
extern cred_t *crgetcred(void);
extern void crset(struct proc *, cred_t *);
+extern void crset_zone_privall(cred_t *);
extern int groupmember(gid_t, const cred_t *);
extern int supgroupmember(gid_t, const cred_t *);
extern int hasprocperm(const cred_t *, const cred_t *);
@@ -104,6 +104,7 @@ extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *);
extern uint_t crgetref(const cred_t *);
extern const gid_t *crgetgroups(const cred_t *);
+extern const gid_t *crgetggroups(const struct credgrp *);
extern int crgetngroups(const cred_t *);
@@ -120,7 +121,13 @@ extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t);
*/
extern int crsetugid(cred_t *, uid_t, gid_t);
+/*
+ * Functions to handle the supplemental group list.
+ */
extern int crsetgroups(cred_t *, int, gid_t *);
+extern struct credgrp *crgrpcopyin(int, gid_t *);
+extern void crgrprele(struct credgrp *);
+extern void crsetcredgrp(cred_t *, struct credgrp *);
/*
* Private interface for setting zone association of credential.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
index 5fabb14a290e..6467781ce806 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
@@ -19,18 +19,18 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
-
#ifndef _SYS_DEBUG_H
#define _SYS_DEBUG_H
#include <sys/types.h>
+#include <sys/note.h>
#ifdef __cplusplus
extern "C" {
@@ -73,6 +73,25 @@ extern int assfail();
#endif
/*
+ * IMPLY and EQUIV are assertions of the form:
+ *
+ * if (a) then (b)
+ * and
+ * if (a) then (b) *AND* if (b) then (a)
+ */
+#ifdef DEBUG
+#define IMPLY(A, B) \
+ ((void)(((!(A)) || (B)) || \
+ assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__)))
+#define EQUIV(A, B) \
+ ((void)((!!(A) == !!(B)) || \
+ assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__)))
+#else
+#define IMPLY(A, B) ((void)0)
+#define EQUIV(A, B) ((void)0)
+#endif
+
+/*
* ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
* and prints out the values of the left and right hand expressions as part of
* the panic message to ease debugging. The three variants imply the type
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
index 21b7dbe52c11..c752edc99bbd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -68,6 +68,18 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
#define FM_EREPORT_FAILMODE_WAIT "wait"
#define FM_EREPORT_FAILMODE_CONTINUE "continue"
@@ -75,6 +87,7 @@ extern "C" {
#define FM_RESOURCE_REMOVED "removed"
#define FM_RESOURCE_AUTOREPLACE "autoreplace"
+#define FM_RESOURCE_STATECHANGE "statechange"
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
index 20c07890fad4..f5f93421bd74 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FM_PROTOCOL_H
@@ -43,11 +42,13 @@ extern "C" {
#define FM_CLASS "class"
#define FM_VERSION "version"
-/* FM event class values */
+/* FM protocol category 1 class names */
#define FM_EREPORT_CLASS "ereport"
#define FM_FAULT_CLASS "fault"
+#define FM_DEFECT_CLASS "defect"
#define FM_RSRC_CLASS "resource"
#define FM_LIST_EVENT "list"
+#define FM_IREPORT_CLASS "ireport"
/* FM list.* event class values */
#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
@@ -71,6 +72,12 @@ extern "C" {
/* list.* event payload member names */
#define FM_LIST_EVENT_SIZE "list-sz"
+/* ireport.* event payload member names */
+#define FM_IREPORT_DETECTOR "detector"
+#define FM_IREPORT_UUID "uuid"
+#define FM_IREPORT_PRIORITY "pri"
+#define FM_IREPORT_ATTRIBUTES "attr"
+
/*
* list.suspect, isolated, updated, repaired and resolved
* versions/payload member names.
@@ -82,9 +89,11 @@ extern "C" {
#define FM_SUSPECT_FAULT_LIST "fault-list"
#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
#define FM_SUSPECT_FAULT_STATUS "fault-status"
+#define FM_SUSPECT_INJECTED "__injected"
#define FM_SUSPECT_MESSAGE "message"
#define FM_SUSPECT_RETIRE "retire"
#define FM_SUSPECT_RESPONSE "response"
+#define FM_SUSPECT_SEVERITY "severity"
#define FM_SUSPECT_VERS0 0
#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
@@ -120,6 +129,7 @@ extern "C" {
#define FM_RSRC_ASRU_REPAIRED "repaired"
#define FM_RSRC_ASRU_REPLACED "replaced"
#define FM_RSRC_ASRU_ACQUITTED "acquitted"
+#define FM_RSRC_ASRU_RESOLVED "resolved"
#define FM_RSRC_ASRU_UNUSABLE "unusable"
#define FM_RSRC_ASRU_EVENT "event"
@@ -128,6 +138,8 @@ extern "C" {
#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
#define FM_RSRC_XPRT_UUID "uuid"
#define FM_RSRC_XPRT_SUBCLASS "subclass"
+#define FM_RSRC_XPRT_FAULT_STATUS "fault-status"
+#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru"
/*
* FM ENA Format Macros
@@ -166,6 +178,7 @@ extern "C" {
/* FMRI authority-type member names */
#define FM_FMRI_AUTH_CHASSIS "chassis-id"
+#define FM_FMRI_AUTH_PRODUCT_SN "product-sn"
#define FM_FMRI_AUTH_PRODUCT "product-id"
#define FM_FMRI_AUTH_DOMAIN "domain-id"
#define FM_FMRI_AUTH_SERVER "server-id"
@@ -185,6 +198,7 @@ extern "C" {
#define FM_FMRI_SCHEME_PKG "pkg"
#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
#define FM_FMRI_SCHEME_ZFS "zfs"
+#define FM_FMRI_SCHEME_SW "sw"
/* Scheme versions */
#define FMD_SCHEME_VERSION0 0
@@ -204,8 +218,12 @@ extern "C" {
#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
#define LEGACY_SCHEME_VERSION0 0
#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define SVC_SCHEME_VERSION0 0
+#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0
#define ZFS_SCHEME_VERSION0 0
#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
+#define SW_SCHEME_VERSION0 0
+#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0
/* hc scheme member names */
#define FM_FMRI_HC_SERIAL_ID "serial"
@@ -237,6 +255,7 @@ extern "C" {
/* dev scheme member names */
#define FM_FMRI_DEV_ID "devid"
+#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id"
#define FM_FMRI_DEV_PATH "device-path"
/* pkg scheme member names */
@@ -245,14 +264,13 @@ extern "C" {
#define FM_FMRI_PKG_VERSION "pkg-version"
/* svc scheme member names */
-#define FM_FMRI_SVC_NAME "service-name"
-#define FM_FMRI_SVC_VERSION "service-version"
-#define FM_FMRI_SVC_INSTANCE "instance"
-#define FM_FMRI_SVC_CONTRACT_ID "contract-id"
+#define FM_FMRI_SVC_NAME "svc-name"
+#define FM_FMRI_SVC_INSTANCE "svc-instance"
+#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id"
/* svc-authority member names */
#define FM_FMRI_SVC_AUTH_SCOPE "scope"
-#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN"
+#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn"
/* cpu scheme member names */
#define FM_FMRI_CPU_ID "cpuid"
@@ -290,6 +308,25 @@ extern "C" {
#define FM_FMRI_ZFS_POOL "pool"
#define FM_FMRI_ZFS_VDEV "vdev"
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define FM_FMRI_SW_OBJ "object"
+#define FM_FMRI_SW_OBJ_PATH "path"
+#define FM_FMRI_SW_OBJ_ROOT "root"
+#define FM_FMRI_SW_OBJ_PKG "pkg"
+#define FM_FMRI_SW_SITE "site"
+#define FM_FMRI_SW_SITE_TOKEN "token"
+#define FM_FMRI_SW_SITE_MODULE "module"
+#define FM_FMRI_SW_SITE_FILE "file"
+#define FM_FMRI_SW_SITE_LINE "line"
+#define FM_FMRI_SW_SITE_FUNC "func"
+#define FM_FMRI_SW_CTXT "context"
+#define FM_FMRI_SW_CTXT_ORIGIN "origin"
+#define FM_FMRI_SW_CTXT_EXECNAME "execname"
+#define FM_FMRI_SW_CTXT_PID "pid"
+#define FM_FMRI_SW_CTXT_ZONE "zone"
+#define FM_FMRI_SW_CTXT_CTID "ctid"
+#define FM_FMRI_SW_CTXT_STACK "stack"
+
extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
extern void fm_nva_xdestroy(nv_alloc_t *);
@@ -306,7 +343,7 @@ extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
int, ...);
extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *);
+ const char *, const char *);
extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
uint8_t *, const char *);
@@ -315,6 +352,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
const char *, const char *);
extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+ nvlist_t *, int, ...);
extern uint64_t fm_ena_increment(uint64_t);
extern uint64_t fm_ena_generate(uint64_t, uchar_t);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
index cd176f008d3f..7d7b94977afa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FM_UTIL_H
#define _SYS_FM_UTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -94,6 +91,7 @@ extern void fm_banner(void);
extern void fm_ereport_dump(void);
extern void fm_ereport_post(nvlist_t *, int);
+extern int is_fm_panic();
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index 8400dc1e93c4..edc26cde394d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -18,11 +18,13 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/* Portions Copyright 2010 Robert Milkowski */
+
#ifndef _SYS_FS_ZFS_H
#define _SYS_FS_ZFS_H
@@ -52,6 +54,10 @@ typedef enum {
#define ZFS_TYPE_DATASET \
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN (1024 * 8)
+#define ZAP_OLDMAXVALUELEN 1024
+
/*
* Dataset properties are identified by these constants and must be added to
* the end of this list to ensure that external consumers are not affected
@@ -83,12 +89,11 @@ typedef enum {
ZFS_PROP_READONLY,
ZFS_PROP_ZONED,
ZFS_PROP_SNAPDIR,
- ZFS_PROP_ACLMODE,
+ ZFS_PROP_PRIVATE, /* not exposed to user, temporary */
ZFS_PROP_ACLINHERIT,
ZFS_PROP_CREATETXG, /* not exposed to the user */
ZFS_PROP_NAME, /* not exposed to the user */
ZFS_PROP_CANMOUNT,
- ZFS_PROP_SHAREISCSI,
ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
ZFS_PROP_XATTR,
ZFS_PROP_NUMCLONES, /* not exposed to the user */
@@ -110,6 +115,15 @@ typedef enum {
ZFS_PROP_USEDCHILD,
ZFS_PROP_USEDREFRESERV,
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
+ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
+ ZFS_PROP_DEFER_DESTROY,
+ ZFS_PROP_USERREFS,
+ ZFS_PROP_LOGBIAS,
+ ZFS_PROP_UNIQUE, /* not exposed to the user */
+ ZFS_PROP_OBJSETID, /* not exposed to the user */
+ ZFS_PROP_DEDUP,
+ ZFS_PROP_MLSLABEL,
+ ZFS_PROP_SYNC,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -132,8 +146,6 @@ extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
typedef enum {
ZPOOL_PROP_NAME,
ZPOOL_PROP_SIZE,
- ZPOOL_PROP_USED,
- ZPOOL_PROP_AVAILABLE,
ZPOOL_PROP_CAPACITY,
ZPOOL_PROP_ALTROOT,
ZPOOL_PROP_HEALTH,
@@ -145,6 +157,12 @@ typedef enum {
ZPOOL_PROP_CACHEFILE,
ZPOOL_PROP_FAILUREMODE,
ZPOOL_PROP_LISTSNAPS,
+ ZPOOL_PROP_AUTOEXPAND,
+ ZPOOL_PROP_DEDUPDITTO,
+ ZPOOL_PROP_DEDUPRATIO,
+ ZPOOL_PROP_FREE,
+ ZPOOL_PROP_ALLOCATED,
+ ZPOOL_PROP_READONLY,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -159,10 +177,27 @@ typedef enum {
ZPROP_SRC_DEFAULT = 0x2,
ZPROP_SRC_TEMPORARY = 0x4,
ZPROP_SRC_LOCAL = 0x8,
- ZPROP_SRC_INHERITED = 0x10
+ ZPROP_SRC_INHERITED = 0x10,
+ ZPROP_SRC_RECEIVED = 0x20
} zprop_source_t;
-#define ZPROP_SRC_ALL 0x1f
+#define ZPROP_SRC_ALL 0x3f
+
+#define ZPROP_SOURCE_VAL_RECVD "$recvd"
+#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
+/*
+ * Dataset flag implemented as a special entry in the props zap object
+ * indicating that the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
+ * just as it did in earlier versions, and thereafter, local properties are
+ * preserved.
+ */
+#define ZPROP_HAS_RECVD "$hasrecvd"
+
+typedef enum {
+ ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
+ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
+} zprop_errflags_t;
typedef int (*zprop_func)(int, void *);
@@ -184,9 +219,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
const char *zfs_prop_to_name(zfs_prop_t);
zfs_prop_t zfs_name_to_prop(const char *);
boolean_t zfs_prop_user(const char *);
-boolean_t zfs_prop_userquota(const char *name);
+boolean_t zfs_prop_userquota(const char *);
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
/*
@@ -199,6 +235,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t);
boolean_t zpool_prop_readonly(zpool_prop_t);
int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
/*
* Definitions for the Delegation.
@@ -229,6 +266,8 @@ typedef enum {
#define ZFS_DELEG_PERM_GID "gid"
#define ZFS_DELEG_PERM_GROUPS "groups"
+#define ZFS_MLSLABEL_DEFAULT "none"
+
#define ZFS_SMB_ACL_SRC "src"
#define ZFS_SMB_ACL_TARGET "target"
@@ -238,6 +277,11 @@ typedef enum {
ZFS_CANMOUNT_NOAUTO = 2
} zfs_canmount_type_t;
+typedef enum {
+ ZFS_LOGBIAS_LATENCY = 0,
+ ZFS_LOGBIAS_THROUGHPUT = 1
+} zfs_logbias_op_t;
+
typedef enum zfs_share_op {
ZFS_SHARE_NFS = 0,
ZFS_UNSHARE_NFS = 1,
@@ -258,6 +302,12 @@ typedef enum zfs_cache_type {
ZFS_CACHE_ALL = 2
} zfs_cache_type_t;
+typedef enum {
+ ZFS_SYNC_STANDARD = 0,
+ ZFS_SYNC_ALWAYS = 1,
+ ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+
/*
* On-disk version number.
@@ -277,14 +327,28 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_13 13ULL
#define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
- * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_15
-#define SPA_VERSION_STRING "15"
+#define SPA_VERSION SPA_VERSION_28
+#define SPA_VERSION_STRING "28"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -300,8 +364,8 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_INITIAL SPA_VERSION_1
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
#define SPA_VERSION_SPARES SPA_VERSION_3
-#define SPA_VERSION_RAID6 SPA_VERSION_3
-#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
+#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
@@ -321,6 +385,20 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -328,14 +406,15 @@ typedef enum zfs_cache_type {
* also update the version_table[] and help message in zfs_prop.c.
*
* When changing, be sure to teach GRUB how to read the new format!
- * See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*}
+ * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
*/
#define ZPL_VERSION_1 1ULL
#define ZPL_VERSION_2 2ULL
#define ZPL_VERSION_3 3ULL
#define ZPL_VERSION_4 4ULL
-#define ZPL_VERSION ZPL_VERSION_4
-#define ZPL_VERSION_STRING "4"
+#define ZPL_VERSION_5 5ULL
+#define ZPL_VERSION ZPL_VERSION_5
+#define ZPL_VERSION_STRING "5"
#define ZPL_VERSION_INITIAL ZPL_VERSION_1
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
@@ -343,6 +422,23 @@ typedef enum zfs_cache_type {
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
+#define ZPL_VERSION_SA ZPL_VERSION_5
+
+/* Rewind request information */
+#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
+#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
+#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
+#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
+#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
+#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
+#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
+
+typedef struct zpool_rewind_policy {
+ uint32_t zrp_request; /* rewind behavior requested */
+ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */
+ uint64_t zrp_maxdata; /* max acceptable data errors */
+ uint64_t zrp_txg; /* specific txg to load */
+} zpool_rewind_policy_t;
/*
* The following are configuration names used in the nvlist describing a pool's
@@ -367,7 +463,8 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_ASHIFT "ashift"
#define ZPOOL_CONFIG_ASIZE "asize"
#define ZPOOL_CONFIG_DTL "DTL"
-#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
@@ -376,13 +473,28 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
#define ZPOOL_CONFIG_UNSPARE "unspare"
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
+#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
+#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
+#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
+#define ZPOOL_CONFIG_SPLIT "splitcfg"
+#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
+#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
+#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
+#define ZPOOL_CONFIG_REMOVING "removing"
+#define ZPOOL_CONFIG_RESILVERING "resilvering"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
+#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
+#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -393,6 +505,19 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_DEGRADED "degraded"
#define ZPOOL_CONFIG_REMOVED "removed"
#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
+
+/* Rewind policy parameters */
+#define ZPOOL_REWIND_POLICY "rewind-policy"
+#define ZPOOL_REWIND_REQUEST "rewind-request"
+#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg"
+#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh"
+#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh"
+
+/* Rewind data discovered */
+#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
+#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
+#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -401,6 +526,7 @@ typedef enum zfs_cache_type {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
@@ -450,7 +576,9 @@ typedef enum vdev_aux {
VDEV_AUX_SPARED, /* hot spare used in another pool */
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
- VDEV_AUX_BAD_LOG /* cannot read log chain(s) */
+ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
+ VDEV_AUX_EXTERNAL, /* external diagnosis */
+ VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */
} vdev_aux_t;
/*
@@ -471,14 +599,14 @@ typedef enum pool_state {
} pool_state_t;
/*
- * Scrub types.
+ * Scan Functions.
*/
-typedef enum pool_scrub_type {
- POOL_SCRUB_NONE,
- POOL_SCRUB_RESILVER,
- POOL_SCRUB_EVERYTHING,
- POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+ POOL_SCAN_NONE,
+ POOL_SCAN_SCRUB,
+ POOL_SCAN_RESILVER,
+ POOL_SCAN_FUNCS
+} pool_scan_func_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -494,6 +622,36 @@ typedef enum zio_type {
} zio_type_t;
/*
+ * Pool statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+ /* values stored on disk */
+ uint64_t pss_func; /* pool_scan_func_t */
+ uint64_t pss_state; /* dsl_scan_state_t */
+ uint64_t pss_start_time; /* scan start time */
+ uint64_t pss_end_time; /* scan end time */
+ uint64_t pss_to_examine; /* total bytes to scan */
+ uint64_t pss_examined; /* total examined bytes */
+ uint64_t pss_to_process; /* total bytes to process */
+ uint64_t pss_processed; /* total processed bytes */
+ uint64_t pss_errors; /* scan errors */
+
+ /* values not stored on disk */
+ uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_start; /* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+ DSS_NONE,
+ DSS_SCANNING,
+ DSS_FINISHED,
+ DSS_CANCELED,
+ DSS_NUM_STATES
+} dsl_scan_state_t;
+
+
+/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -511,34 +669,50 @@ typedef struct vdev_stat {
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
uint64_t vs_self_healed; /* self-healed bytes */
- uint64_t vs_scrub_type; /* pool_scrub_type_t */
- uint64_t vs_scrub_complete; /* completed? */
- uint64_t vs_scrub_examined; /* bytes examined; top */
- uint64_t vs_scrub_repaired; /* bytes repaired; leaf */
- uint64_t vs_scrub_errors; /* errors during scrub */
- uint64_t vs_scrub_start; /* UTC scrub start time */
- uint64_t vs_scrub_end; /* UTC scrub end time */
+ uint64_t vs_scan_removing; /* removing? */
+ uint64_t vs_scan_processed; /* scan processed bytes */
} vdev_stat_t;
+/*
+ * DDT statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+ uint64_t ddo_count; /* number of elements in ddt */
+ uint64_t ddo_dspace; /* size of ddt on disk */
+ uint64_t ddo_mspace; /* size of ddt in-core */
+} ddt_object_t;
+
+typedef struct ddt_stat {
+ uint64_t dds_blocks; /* blocks */
+ uint64_t dds_lsize; /* logical size */
+ uint64_t dds_psize; /* physical size */
+ uint64_t dds_dsize; /* deflated allocated size */
+ uint64_t dds_ref_blocks; /* referenced blocks */
+ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
+ uint64_t dds_ref_psize; /* referenced psize * refcnt */
+ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+ ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
+} ddt_histogram_t;
+
#define ZVOL_DRIVER "zvol"
#define ZFS_DRIVER "zfs"
#define ZFS_DEV_NAME "zfs"
#define ZFS_DEV "/dev/" ZFS_DEV_NAME
-/*
- * zvol paths. Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix. Below are the names without /dev.
- */
-#define ZVOL_DEV_DIR "zvol"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
-#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:"
-#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/"
+/* general zvol path */
+#define ZVOL_DIR "/dev/zvol"
+/* expansion */
+#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
+/* for dump and swap */
+#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
+#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"
#define ZVOL_PROP_NAME "name"
+#define ZVOL_DEFAULT_BLOCKSIZE 8192
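A tiny sketch of how the path macros compose, for a hypothetical zvol named
"tank/vol":

#include <stdio.h>

int
main(void)
{
	char dsk[256], rdsk[256];

	(void) snprintf(dsk, sizeof (dsk), "%s%s", ZVOL_FULL_DEV_DIR,
	    "tank/vol");		/* /dev/zvol/dsk/tank/vol */
	(void) snprintf(rdsk, sizeof (rdsk), "%s%s", ZVOL_FULL_RDEV_DIR,
	    "tank/vol");		/* /dev/zvol/rdsk/tank/vol */
	(void) printf("%s\n%s\n", dsk, rdsk);
	return (0);
}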
/*
* /dev/zfs ioctl numbers.
@@ -554,7 +728,7 @@ typedef unsigned long zfs_ioc_t;
#define ZFS_IOC_POOL_CONFIGS _IOWR('Z', 4, struct zfs_cmd)
#define ZFS_IOC_POOL_STATS _IOWR('Z', 5, struct zfs_cmd)
#define ZFS_IOC_POOL_TRYIMPORT _IOWR('Z', 6, struct zfs_cmd)
-#define ZFS_IOC_POOL_SCRUB _IOWR('Z', 7, struct zfs_cmd)
+#define ZFS_IOC_POOL_SCAN _IOWR('Z', 7, struct zfs_cmd)
#define ZFS_IOC_POOL_FREEZE _IOWR('Z', 8, struct zfs_cmd)
#define ZFS_IOC_POOL_UPGRADE _IOWR('Z', 9, struct zfs_cmd)
#define ZFS_IOC_POOL_GET_HISTORY _IOWR('Z', 10, struct zfs_cmd)
@@ -564,52 +738,60 @@ typedef unsigned long zfs_ioc_t;
#define ZFS_IOC_VDEV_ATTACH _IOWR('Z', 14, struct zfs_cmd)
#define ZFS_IOC_VDEV_DETACH _IOWR('Z', 15, struct zfs_cmd)
#define ZFS_IOC_VDEV_SETPATH _IOWR('Z', 16, struct zfs_cmd)
-#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 17, struct zfs_cmd)
-#define ZFS_IOC_OBJSET_ZPLPROPS _IOWR('Z', 18, struct zfs_cmd)
-#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 19, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
-#define ZFS_IOC_SET_PROP _IOWR('Z', 21, struct zfs_cmd)
-#define ZFS_IOC_CREATE_MINOR _IOWR('Z', 22, struct zfs_cmd)
-#define ZFS_IOC_REMOVE_MINOR _IOWR('Z', 23, struct zfs_cmd)
-#define ZFS_IOC_CREATE _IOWR('Z', 24, struct zfs_cmd)
-#define ZFS_IOC_DESTROY _IOWR('Z', 25, struct zfs_cmd)
-#define ZFS_IOC_ROLLBACK _IOWR('Z', 26, struct zfs_cmd)
-#define ZFS_IOC_RENAME _IOWR('Z', 27, struct zfs_cmd)
-#define ZFS_IOC_RECV _IOWR('Z', 28, struct zfs_cmd)
-#define ZFS_IOC_SEND _IOWR('Z', 29, struct zfs_cmd)
-#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 30, struct zfs_cmd)
-#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 31, struct zfs_cmd)
-#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 32, struct zfs_cmd)
-#define ZFS_IOC_ERROR_LOG _IOWR('Z', 33, struct zfs_cmd)
-#define ZFS_IOC_CLEAR _IOWR('Z', 34, struct zfs_cmd)
-#define ZFS_IOC_PROMOTE _IOWR('Z', 35, struct zfs_cmd)
-#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 36, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT _IOWR('Z', 37, struct zfs_cmd)
-#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 38, struct zfs_cmd)
-#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 39, struct zfs_cmd)
-#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 40, struct zfs_cmd)
-#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 41, struct zfs_cmd)
-#define ZFS_IOC_SET_FSACL _IOWR('Z', 42, struct zfs_cmd)
-#define ZFS_IOC_GET_FSACL _IOWR('Z', 43, struct zfs_cmd)
-#define ZFS_IOC_ISCSI_PERM_CHECK _IOWR('Z', 44, struct zfs_cmd)
-#define ZFS_IOC_SHARE _IOWR('Z', 45, struct zfs_cmd)
-#define ZFS_IOC_INHERIT_PROP _IOWR('Z', 46, struct zfs_cmd)
-#define ZFS_IOC_JAIL _IOWR('Z', 47, struct zfs_cmd)
-#define ZFS_IOC_UNJAIL _IOWR('Z', 48, struct zfs_cmd)
-#define ZFS_IOC_SMB_ACL _IOWR('Z', 49, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_ONE _IOWR('Z', 50, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_MANY _IOWR('Z', 51, struct zfs_cmd)
-#define ZFS_IOC_USERSPACE_UPGRADE _IOWR('Z', 52, struct zfs_cmd)
-#define ZFS_IOC_SETFRU _IOWR('Z', 53, struct zfs_cmd)
+#define ZFS_IOC_VDEV_SETFRU _IOWR('Z', 17, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 18, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_ZPLPROPS _IOWR('Z', 19, struct zfs_cmd)
+#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 21, struct zfs_cmd)
+#define ZFS_IOC_SET_PROP _IOWR('Z', 22, struct zfs_cmd)
+#define ZFS_IOC_CREATE _IOWR('Z', 23, struct zfs_cmd)
+#define ZFS_IOC_DESTROY _IOWR('Z', 24, struct zfs_cmd)
+#define ZFS_IOC_ROLLBACK _IOWR('Z', 25, struct zfs_cmd)
+#define ZFS_IOC_RENAME _IOWR('Z', 26, struct zfs_cmd)
+#define ZFS_IOC_RECV _IOWR('Z', 27, struct zfs_cmd)
+#define ZFS_IOC_SEND _IOWR('Z', 28, struct zfs_cmd)
+#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 29, struct zfs_cmd)
+#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 30, struct zfs_cmd)
+#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 31, struct zfs_cmd)
+#define ZFS_IOC_ERROR_LOG _IOWR('Z', 32, struct zfs_cmd)
+#define ZFS_IOC_CLEAR _IOWR('Z', 33, struct zfs_cmd)
+#define ZFS_IOC_PROMOTE _IOWR('Z', 34, struct zfs_cmd)
+#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 35, struct zfs_cmd)
+#define ZFS_IOC_SNAPSHOT _IOWR('Z', 36, struct zfs_cmd)
+#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 37, struct zfs_cmd)
+#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 38, struct zfs_cmd)
+#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 39, struct zfs_cmd)
+#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 40, struct zfs_cmd)
+#define ZFS_IOC_SET_FSACL _IOWR('Z', 41, struct zfs_cmd)
+#define ZFS_IOC_GET_FSACL _IOWR('Z', 42, struct zfs_cmd)
+#define ZFS_IOC_SHARE _IOWR('Z', 43, struct zfs_cmd)
+#define ZFS_IOC_INHERIT_PROP _IOWR('Z', 44, struct zfs_cmd)
+#define ZFS_IOC_SMB_ACL _IOWR('Z', 45, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_ONE _IOWR('Z', 46, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_MANY _IOWR('Z', 47, struct zfs_cmd)
+#define ZFS_IOC_USERSPACE_UPGRADE _IOWR('Z', 48, struct zfs_cmd)
+#define ZFS_IOC_HOLD _IOWR('Z', 49, struct zfs_cmd)
+#define ZFS_IOC_RELEASE _IOWR('Z', 50, struct zfs_cmd)
+#define ZFS_IOC_GET_HOLDS _IOWR('Z', 51, struct zfs_cmd)
+#define ZFS_IOC_OBJSET_RECVD_PROPS _IOWR('Z', 52, struct zfs_cmd)
+#define ZFS_IOC_VDEV_SPLIT _IOWR('Z', 53, struct zfs_cmd)
+#define ZFS_IOC_NEXT_OBJ _IOWR('Z', 54, struct zfs_cmd)
+#define ZFS_IOC_DIFF _IOWR('Z', 55, struct zfs_cmd)
+#define ZFS_IOC_TMP_SNAPSHOT _IOWR('Z', 56, struct zfs_cmd)
+#define ZFS_IOC_OBJ_TO_STATS _IOWR('Z', 57, struct zfs_cmd)
+#define ZFS_IOC_JAIL _IOWR('Z', 58, struct zfs_cmd)
+#define ZFS_IOC_UNJAIL _IOWR('Z', 59, struct zfs_cmd)
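A hedged userland sketch of driving one of these ioctls: start a scrub on a
pool named "tank". The zfs_cmd fields zc_name and zc_cookie are declared
elsewhere in this header; carrying the pool_scan_func_t in zc_cookie is shown
here only as an illustration of the calling pattern.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int
start_scrub(void)
{
	struct zfs_cmd zc;
	int fd, err;

	if ((fd = open(ZFS_DEV, O_RDWR)) < 0)
		return (-1);
	(void) memset(&zc, 0, sizeof (zc));
	(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
	zc.zc_cookie = POOL_SCAN_SCRUB;		/* pool_scan_func_t */
	err = ioctl(fd, ZFS_IOC_POOL_SCAN, &zc);
	(void) close(fd);
	return (err);
}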
/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
typedef enum {
- SPA_LOAD_NONE, /* no load in progress */
- SPA_LOAD_OPEN, /* normal open */
- SPA_LOAD_IMPORT, /* import in progress */
- SPA_LOAD_TRYIMPORT /* tryimport in progress */
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT, /* tryimport in progress */
+ SPA_LOAD_RECOVER, /* recovery requested */
+ SPA_LOAD_ERROR /* load failed */
} spa_load_state_t;
/*
@@ -641,9 +823,19 @@ typedef enum {
#define ZFS_ONLINE_CHECKREMOVE 0x1
#define ZFS_ONLINE_UNSPARE 0x2
#define ZFS_ONLINE_FORCEFAULT 0x4
+#define ZFS_ONLINE_EXPAND 0x8
#define ZFS_OFFLINE_TEMPORARY 0x1
/*
+ * Flags for ZFS_IOC_POOL_IMPORT
+ */
+#define ZFS_IMPORT_NORMAL 0x0
+#define ZFS_IMPORT_VERBATIM 0x1
+#define ZFS_IMPORT_ANY_HOST 0x2
+#define ZFS_IMPORT_MISSING_LOG 0x4
+#define ZFS_IMPORT_ONLY 0x8
+
+/*
* Sysevent payload members. ZFS will generate the following sysevents with the
* given payloads:
*
@@ -671,7 +863,7 @@ typedef enum {
/*
* Note: This is encoded on-disk, so new events must be added to the
* end, and unused events can not be removed. Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
*/
typedef enum history_internal_events {
LOG_NO_EVENT = 0,
@@ -688,7 +880,7 @@ typedef enum history_internal_events {
LOG_POOL_VDEV_OFFLINE,
LOG_POOL_UPGRADE,
LOG_POOL_CLEAR,
- LOG_POOL_SCRUB,
+ LOG_POOL_SCAN,
LOG_POOL_PROPSET,
LOG_DS_CREATE,
LOG_DS_CLONE,
@@ -711,7 +903,10 @@ typedef enum history_internal_events {
LOG_DS_UPGRADE,
LOG_DS_REFQUOTA,
LOG_DS_REFRESERV,
- LOG_POOL_SCRUB_DONE,
+ LOG_POOL_SCAN_DONE,
+ LOG_DS_USER_HOLD,
+ LOG_DS_USER_RELEASE,
+ LOG_POOL_SPLIT,
LOG_END
} history_internal_events_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
new file mode 100644
index 000000000000..36c9eaa7f18e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZUT_H
+#define _ZUT_H
+
+/*
+ * IOCTLs for the zfs unit test driver
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define ZUT_DRIVER "zut"
+#define ZUT_DEV "/dev/zut"
+
+#define ZUT_VERSION_STRING "1"
+
+/*
+ * /dev/zut ioctl numbers.
+ */
+#define ZUT_IOC ('U' << 8)
+
+/* Request flags */
+#define ZUT_IGNORECASE 0x01
+#define ZUT_ACCFILTER 0x02
+#define ZUT_XATTR 0x04
+#define ZUT_EXTRDDIR 0x08
+#define ZUT_GETSTAT 0x10
+
+typedef struct zut_lookup {
+ int zl_reqflags;
+ int zl_deflags; /* output */
+ int zl_retcode; /* output */
+ char zl_dir[MAXPATHLEN];
+ char zl_file[MAXNAMELEN];
+ char zl_xfile[MAXNAMELEN];
+ char zl_real[MAXPATHLEN]; /* output */
+ uint64_t zl_xvattrs; /* output */
+ struct stat64 zl_statbuf; /* output */
+} zut_lookup_t;
+
+typedef struct zut_readdir {
+ uint64_t zr_buf; /* pointer to output buffer */
+ uint64_t zr_loffset; /* output */
+ char zr_dir[MAXPATHLEN];
+ char zr_file[MAXNAMELEN];
+ int zr_reqflags;
+ int zr_retcode; /* output */
+ int zr_eof; /* output */
+ uint_t zr_bytes; /* output */
+ uint_t zr_buflen;
+} zut_readdir_t;
+
+typedef enum zut_ioc {
+ ZUT_IOC_MIN_CMD = ZUT_IOC - 1,
+ ZUT_IOC_LOOKUP = ZUT_IOC,
+ ZUT_IOC_READDIR,
+ ZUT_IOC_MAX_CMD
+} zut_ioc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZUT_H */
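A speculative userland sketch of exercising the test driver: request a
case-insensitive lookup plus stat of a file under a directory. The driver-side
behaviour is not part of this header, so treat the call sequence as
illustrative.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int
zut_lookup_example(const char *dir, const char *file)
{
	zut_lookup_t zl;
	int fd, err;

	if ((fd = open(ZUT_DEV, O_RDONLY)) < 0)
		return (-1);
	(void) memset(&zl, 0, sizeof (zl));
	(void) strlcpy(zl.zl_dir, dir, sizeof (zl.zl_dir));
	(void) strlcpy(zl.zl_file, file, sizeof (zl.zl_file));
	zl.zl_reqflags = ZUT_IGNORECASE | ZUT_GETSTAT;
	err = ioctl(fd, ZUT_IOC_LOOKUP, &zl);
	(void) close(fd);
	return (err != 0 ? err : zl.zl_retcode);
}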
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
index 97f7ed66ae09..f3fc634fde71 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
@@ -20,15 +20,16 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * These are Consolidation Private interfaces and are subject to change.
*/
#ifndef _SYS_GFS_H
#define _SYS_GFS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mutex.h>
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
index 3a405e409708..39eeb905c72b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
@@ -19,14 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_IDMAP_H
#define _SYS_IDMAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
/* Idmap status codes */
#define IDMAP_SUCCESS 0
@@ -64,12 +63,13 @@
#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970
#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969
#define IDMAP_ERR_BAD_UTF8 -9968
-#define IDMAP_ERR_NONEGENERATED -9967
+#define IDMAP_ERR_NONE_GENERATED -9967
#define IDMAP_ERR_PROP_UNKNOWN -9966
#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965
#define IDMAP_ERR_NS_LDAP_PARTIAL -9964
#define IDMAP_ERR_NS_LDAP_CFG -9963
#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962
+#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961
/* Reserved GIDs for some well-known SIDs */
#define IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */
@@ -90,4 +90,8 @@
*/
#define IDMAP_MAX_DOOR_RPC (256 * 1024)
+#define IDMAP_SENTINEL_PID UINT32_MAX
+#define IDMAP_ID_IS_EPHEMERAL(pid) \
+ (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID))
+
#endif /* _SYS_IDMAP_H */
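A short illustration of the new ephemeral-id test: any value above INT32_MAX
other than the sentinel counts as an ephemeral (non-persistent) id.

#include <assert.h>
#include <stdint.h>

static void
check_ephemeral(void)
{
	assert(IDMAP_ID_IS_EPHEMERAL(2147483649U));	/* 0x80000001 */
	assert(!IDMAP_ID_IS_EPHEMERAL(IDMAP_SENTINEL_PID));
	assert(!IDMAP_ID_IS_EPHEMERAL(1000));
}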
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
index 19faf42f7311..f5156148bcbd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
@@ -452,6 +452,12 @@ extern "C" {
#elif defined(__powerpc__)
+#if defined(__BIG_ENDIAN__)
+#define _BIT_FIELDS_HTOL
+#else
+#define _BIT_FIELDS_LTOH
+#endif
+
/*
* The following set of definitions characterize the Solaris on SPARC systems.
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
index cc578674318a..abf84cf593e9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_NVPAIR_H
#define _SYS_NVPAIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/time.h>
#include <sys/errno.h>
@@ -160,6 +157,8 @@ int nvlist_unpack(char *, size_t, nvlist_t **, int);
int nvlist_dup(nvlist_t *, nvlist_t **, int);
int nvlist_merge(nvlist_t *, nvlist_t *, int);
+uint_t nvlist_nvflag(nvlist_t *);
+
int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
@@ -199,6 +198,7 @@ int nvlist_add_double(nvlist_t *, const char *, double);
int nvlist_remove(nvlist_t *, const char *, data_type_t);
int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
int nvlist_lookup_boolean(nvlist_t *, const char *);
int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
@@ -237,9 +237,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
int *, char **);
boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);
/* processing nvpair */
nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
+nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
char *nvpair_name(nvpair_t *);
data_type_t nvpair_type(nvpair_t *);
int nvpair_type_is_array(nvpair_t *);
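A small userland sketch of the additions: bail out early on an empty list,
otherwise walk it back to front with the new reverse iterator.

#include <stdio.h>
#include <libnvpair.h>

static void
dump_reverse(nvlist_t *nvl)
{
	nvpair_t *nvp;

	if (nvlist_empty(nvl))
		return;
	/* nvlist_prev_nvpair(nvl, NULL) yields the last pair */
	for (nvp = nvlist_prev_nvpair(nvl, NULL); nvp != NULL;
	    nvp = nvlist_prev_nvpair(nvl, nvp))
		(void) printf("%s (type %d)\n", nvpair_name(nvp),
		    (int)nvpair_type(nvp));
}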
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
index 3a76c8c9b420..c0fe6e21b85f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
@@ -32,8 +32,6 @@
#ifndef _SYS_PROCESSOR_H
#define _SYS_PROCESSOR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/procset.h>
@@ -140,6 +138,7 @@ extern lgrpid_t gethomelgroup();
* Internal interface prototypes
*/
extern int p_online_internal(processorid_t, int, int *);
+extern int p_online_internal_locked(processorid_t, int, int *);
#endif /* !_KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
index 0a61e41de849..3558c2a7f03d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,14 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SYSEVENT_H
#define _SYS_SYSEVENT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#ifdef __cplusplus
@@ -164,18 +161,50 @@ typedef struct sysevent_value {
#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */
/*
- * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to
- * the consolidation private interface.
+ * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to
+ * the consolidation private interface, so flags defined here are restricted
+ * to the LSB.
+ *
+ * EVCH_SUB_KEEP indicates that this subscription should persist even if
+ * this subscriber id should die unexpectedly; matching events will be
+ * queued (up to a limit) and will be delivered if/when we restart again
+ * with the same subscriber id.
+ */
+#define EVCH_SUB_KEEP 0x01
+
+/*
+ * Subscriptions may be wildcarded, but we limit the number of
+ * wildcards permitted.
+ */
+#define EVCH_WILDCARD_MAX 10
+
+/*
+ * Used in unsubscribe to indicate all subscriber ids for a channel.
*/
-#define EVCH_SUB_KEEP 0x0001
#define EVCH_ALLSUB "all_subs"
/*
* Meaning of flags parameter of channel bind function
+ *
+ * EVCH_CREAT indicates to create a channel if not already present.
+ *
+ * EVCH_HOLD_PEND indicates that events should be published to this
+ * channel even if there are no matching subscribers present; when
+ * a subscriber belatedly binds to the channel and registers their
+ * subscriptions they will receive events that predate their bind.
+ * If the channel is closed, however, with no remaining bindings then
+ * the channel is destroyed.
+ *
+ * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND -
+ * even if the channel has no remaining bindings it will not be
+ * destroyed so long as events remain unconsumed. This is suitable for
+ * use with short-lived event producers that may bind to (create) the
+ * channel and exit before the intended consumer has started.
*/
-#define EVCH_CREAT 0x0001 /* Create a channel if not present */
+#define EVCH_CREAT 0x0001
#define EVCH_HOLD_PEND 0x0002
-#define EVCH_B_FLAGS 0x0003 /* All valid bits */
+#define EVCH_HOLD_PEND_INDEF 0x0004
+#define EVCH_B_FLAGS 0x0007 /* All valid bits */
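A hedged userland sketch of these flags (Solaris-side libsysevent, per the
declarations under #ifdef sun below): bind a channel, creating it if needed
and holding events for late subscribers, then register a persistent
subscription for ZFS events. The channel and subscriber names are made up;
EC_ZFS comes from sys/sysevent/eventdefs.h.

#include <libsysevent.h>

static int
ev_handler(sysevent_t *ev, void *cookie)
{
	/* consume the event */
	return (0);
}

static int
open_zfs_channel(evchan_t **chp)
{
	int err;

	err = sysevent_evc_bind("com.example:demo", chp,
	    EVCH_CREAT | EVCH_HOLD_PEND);
	if (err == 0)
		err = sysevent_evc_subscribe(*chp, "demo_sub", EC_ZFS,
		    ev_handler, NULL, EVCH_SUB_KEEP);
	return (err);
}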
/*
* Meaning of commands of evc_control function
@@ -185,38 +214,71 @@ typedef struct sysevent_value {
#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
+#ifdef sun
/*
- * Event channel interface definitions
+ * Shared user/kernel event channel interface definitions
*/
-int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
-void sysevent_evc_unbind(evchan_t *);
-int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
+extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
+extern int sysevent_evc_unbind(evchan_t *);
+extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
int (*)(sysevent_t *, void *), void *, uint32_t);
-void sysevent_evc_unsubscribe(evchan_t *, const char *);
-int sysevent_evc_publish(evchan_t *, const char *, const char *,
+extern int sysevent_evc_unsubscribe(evchan_t *, const char *);
+extern int sysevent_evc_publish(evchan_t *, const char *, const char *,
const char *, const char *, nvlist_t *, uint32_t);
-int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
+extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
+#endif /* sun */
-#ifdef _KERNEL
+#ifndef _KERNEL
+
+#ifdef sun
+/*
+ * Userland-only event channel interfaces
+ */
+
+#include <door.h>
+
+typedef struct sysevent_subattr sysevent_subattr_t;
+
+extern sysevent_subattr_t *sysevent_subattr_alloc(void);
+extern void sysevent_subattr_free(sysevent_subattr_t *);
+
+extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *);
+extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *);
+
+extern void sysevent_subattr_thrcreate(sysevent_subattr_t *,
+ door_xcreate_server_func_t *, void *);
+extern void sysevent_subattr_thrsetup(sysevent_subattr_t *,
+ door_xcreate_thrsetup_func_t *, void *);
+
+extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
+ int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
+#endif /* sun */
+
+#else
/*
* Kernel log_event interfaces.
*/
-int log_sysevent(sysevent_t *, int, sysevent_id_t *);
-
-sysevent_t *sysevent_alloc(char *, char *, char *, int);
-void sysevent_free(sysevent_t *);
-int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int);
-void sysevent_free_attr(sysevent_attr_list_t *);
-int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
-void sysevent_detach_attributes(sysevent_t *);
-char *sysevent_get_class_name(sysevent_t *);
-char *sysevent_get_subclass_name(sysevent_t *);
-uint64_t sysevent_get_seq(sysevent_t *);
-void sysevent_get_time(sysevent_t *, hrtime_t *);
-size_t sysevent_get_size(sysevent_t *);
-char *sysevent_get_pub(sysevent_t *);
-int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+extern int log_sysevent(sysevent_t *, int, sysevent_id_t *);
+
+extern sysevent_t *sysevent_alloc(char *, char *, char *, int);
+extern void sysevent_free(sysevent_t *);
+extern int sysevent_add_attr(sysevent_attr_list_t **, char *,
+ sysevent_value_t *, int);
+extern void sysevent_free_attr(sysevent_attr_list_t *);
+extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
+extern void sysevent_detach_attributes(sysevent_t *);
+#ifdef sun
+extern char *sysevent_get_class_name(sysevent_t *);
+extern char *sysevent_get_subclass_name(sysevent_t *);
+extern uint64_t sysevent_get_seq(sysevent_t *);
+extern void sysevent_get_time(sysevent_t *, hrtime_t *);
+extern size_t sysevent_get_size(sysevent_t *);
+extern char *sysevent_get_pub(sysevent_t *);
+extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+#endif /* sun */
#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
new file mode 100644
index 000000000000..9d3107d09011
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSEVENT_DEV_H
+#define _SYS_SYSEVENT_DEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/sysevent/eventdefs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Event schema for EC_DEV_ADD/ESC_DISK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ * if it exists.
+ * /dev name associated with the device if it exists.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_DISK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ * if it exists.
+ * /dev name associated with the device if it exists.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Event Class - EC_DEV_BRANCH
+ * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path to the root node of the device subtree
+ * without the "/devices" prefix.
+ */
+
+#define EV_VERSION "version"
+#define DEV_PHYS_PATH "phys_path"
+#define DEV_NAME "dev_name"
+#define DEV_DRIVER_NAME "driver_name"
+#define DEV_INSTANCE "instance"
+#define DEV_PROP_PREFIX "prop-"
+
+#define EV_V1 1
+
+/* maximum number of devinfo node properties added to the event */
+#define MAX_PROP_COUNT 100
+
+/* only properties with size less than PROP_LEN_LIMIT are added to the event */
+#define PROP_LEN_LIMIT 1024
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_DEV_H */
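An illustrative producer-side sketch of the EC_DEV_ADD/ESC_DISK attribute list
described above, built with libnvpair; the actual publishing call is out of
scope here.

#include <libnvpair.h>

static nvlist_t *
disk_add_attrs(const char *devname, const char *physpath,
    const char *driver, int instance)
{
	nvlist_t *nvl = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	if (nvlist_add_int32(nvl, EV_VERSION, EV_V1) != 0 ||
	    nvlist_add_string(nvl, DEV_NAME, devname) != 0 ||
	    nvlist_add_string(nvl, DEV_PHYS_PATH, physpath) != 0 ||
	    nvlist_add_string(nvl, DEV_DRIVER_NAME, driver) != 0 ||
	    nvlist_add_int32(nvl, DEV_INSTANCE, instance) != 0) {
		nvlist_free(nvl);
		return (NULL);
	}
	return (nvl);
}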
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
index c46223f76b18..dfa78179bb5f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SYSEVENT_EVENTDEFS_H
@@ -52,6 +51,7 @@ extern "C" {
#define EC_FM "EC_fm" /* FMA error report event */
#define EC_ZFS "EC_zfs" /* ZFS event */
#define EC_DATALINK "EC_datalink" /* datalink event */
+#define EC_VRRP "EC_vrrp" /* VRRP event */
/*
* The following event class is reserved for exclusive use
@@ -179,6 +179,8 @@ extern "C" {
/* Interface within an IPMP group has changed state or type */
#define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change"
+/* IPMP probe has changed state */
+#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state"
/*
* EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes
@@ -200,9 +202,16 @@ extern "C" {
/* device tree branch removed */
#define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove"
-/* device capacity dynamically changed */
+/*
+ * EC_DEV_STATUS subclass definitions
+ *
+ * device capacity dynamically changed
+ */
#define ESC_DEV_DLE "ESC_dev_dle"
+/* LUN has received an eject request from the user */
+#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request"
+
/* FMA Fault and Error event protocol subclass */
#define ESC_FM_ERROR "ESC_FM_error"
#define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay"
@@ -223,26 +232,43 @@ extern "C" {
#define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up"
#define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down"
+/* EC_ACPIEV subclass definitions */
+#define EC_ACPIEV "EC_acpiev"
+#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch"
+#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock"
+#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep"
+#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute"
+#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi"
+#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad"
+
/*
 * ZFS subclass definitions. Supporting attributes (name/value pairs) are found
* in sys/fs/zfs.h
*/
-#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
-#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
-#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
-#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
-#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
-#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
-#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
-#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
-#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
-#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare"
+#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
+#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
+#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
+#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
+#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
+#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
+#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
+#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
+#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
+#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare"
+#define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach"
+#define ESC_ZFS_VDEV_AUTOEXPAND "ESC_ZFS_vdev_autoexpand"
/*
* datalink subclass definitions.
*/
#define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */
+/*
+ * VRRP subclass definitions. Supporting attributes (name/value pairs) are
+ * found in sys/sysevent/vrrp.h
+ */
+#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change"
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
index 22f9fe36601d..18f20908191f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
@@ -31,6 +31,7 @@
#define _SYS_SYSMACROS_H
#include <sys/param.h>
+#include <sys/isa_defs.h>
#ifdef __cplusplus
extern "C" {
@@ -57,6 +58,9 @@ extern "C" {
#ifndef ABS
#define ABS(a) ((a) < 0 ? -(a) : (a))
#endif
+#ifndef SIGNOF
+#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
+#endif
#ifdef _KERNEL
@@ -108,7 +112,7 @@ extern unsigned char bcd_to_byte[256];
#define L_MAXMIN L_MAXMIN32
#endif
-#if defined(sun)
+#ifdef sun
#ifdef _KERNEL
/* major part of a device internal to the kernel */
@@ -168,7 +172,6 @@ extern unsigned char bcd_to_byte[256];
#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
#define geteminor(x) (minor_t)((x) & L_MAXMIN)
-
#endif /* sun */
/*
@@ -371,6 +374,41 @@ extern unsigned char bcd_to_byte[256];
#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
#endif
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(ulong_t i)
+{
+ register int h = 1;
+
+ if (i == 0)
+ return (0);
+#ifdef _LP64
+ if (i & 0xffffffff00000000ul) {
+ h += 32; i >>= 32;
+ }
+#endif
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+}
+
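A few spot checks of the definition above, using the userland assert(3C) only
for illustration: highbit() is 1-based, so for a power of two p it returns
log2(p) + 1.

#include <assert.h>

static void
highbit_checks(void)
{
	assert(highbit(0) == 0);
	assert(highbit(1) == 1);	/* bit 0 set */
	assert(highbit(0x80) == 8);	/* bit 7 set */
	assert(highbit(4096) == 13);	/* 2^12 */
	assert(highbit(4097) == 13);	/* highest set bit is still 2^12 */
}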
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
index 3878ded5e973..fb3f76d7765c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
@@ -45,6 +45,8 @@ typedef struct taskq taskq_t;
typedef uintptr_t taskqid_t;
typedef void (task_func_t)(void *);
+struct proc;
+
/*
* Public flags for taskq_create(): bit range 0-15
*/
@@ -52,6 +54,7 @@ typedef void (task_func_t)(void *);
#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */
+#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */
/*
* Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
@@ -61,6 +64,7 @@ typedef void (task_func_t)(void *);
#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
+#define TQ_FRONT 0x08 /* Put task at the front of the queue */
#ifdef _KERNEL
@@ -72,6 +76,10 @@ extern void taskq_mp_init(void);
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int,
int, uint_t);
+extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
+ struct proc *, uint_t);
+extern taskq_t *taskq_create_sysdc(const char *, int, int, int,
+ struct proc *, uint_t, uint_t);
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void nulltask(void *);
extern void taskq_destroy(taskq_t *);
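A kernel-side sketch of the new TQ_FRONT flag, assuming the usual minclsyspri
priority symbol: create a dynamic taskq and push an urgent item to the head of
its queue.

static void
my_task(void *arg)
{
	/* deferred work goes here */
}

static taskq_t *
start_example_tq(void)
{
	taskq_t *tq;

	tq = taskq_create("example_tq", 4, minclsyspri, 1, 8,
	    TASKQ_DYNAMIC);
	if (tq != NULL &&
	    taskq_dispatch(tq, my_task, NULL, TQ_NOSLEEP | TQ_FRONT) == 0) {
		/* queue full or no memory; retry later with TQ_SLEEP */
	}
	return (tq);
}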
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
index d60721c5a786..77c9c0bf84fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
@@ -36,6 +36,30 @@
extern "C" {
#endif
+#ifdef sun
+/*
+ * Unicode encoding conversion functions and their macros.
+ */
+#define UCONV_IN_BIG_ENDIAN 0x0001
+#define UCONV_OUT_BIG_ENDIAN 0x0002
+#define UCONV_IN_SYSTEM_ENDIAN 0x0004
+#define UCONV_OUT_SYSTEM_ENDIAN 0x0008
+#define UCONV_IN_LITTLE_ENDIAN 0x0010
+#define UCONV_OUT_LITTLE_ENDIAN 0x0020
+#define UCONV_IGNORE_NULL 0x0040
+#define UCONV_IN_ACCEPT_BOM 0x0080
+#define UCONV_OUT_EMIT_BOM 0x0100
+
+extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *,
+ int);
+extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *,
+ int);
+extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
+extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
+#endif /* sun */
+
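A brief sketch of the conversion interface above: UTF-8 in, native-endian
UTF-16 out. On success the length arguments are updated to the units actually
consumed and produced.

static int
utf8_to_utf16_example(void)
{
	uchar_t u8[] = "caf\xc3\xa9";	/* "café" encoded as UTF-8 */
	uint16_t u16[16];
	size_t inlen = sizeof (u8) - 1;
	size_t outlen = sizeof (u16) / sizeof (u16[0]);

	return (uconv_u8tou16(u8, &inlen, u16, &outlen,
	    UCONV_OUT_SYSTEM_ENDIAN));
}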
/*
* UTF-8 text preparation functions and their macros.
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
index ab95b99b9ce6..974c915dd553 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
@@ -18,9 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -41,10 +41,6 @@
#include_next <sys/vnode.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
#define IS_DEVVP(vp) \
((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
@@ -69,6 +65,10 @@ typedef struct xoptattr {
uint8_t xoa_av_quarantined;
uint8_t xoa_av_modified;
uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+ uint8_t xoa_reparse;
+ uint64_t xoa_generation;
+ uint8_t xoa_offline;
+ uint8_t xoa_sparse;
} xoptattr_t;
/*
@@ -209,11 +209,15 @@ typedef struct xvattr {
#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */
#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */
#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */
+#define XAT0_REPARSE 0x00002000 /* FS reparse point */
+#define XAT0_GEN 0x00004000 /* object generation number */
+#define XAT0_OFFLINE 0x00008000 /* offline */
+#define XAT0_SPARSE 0x00010000 /* sparse */
#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
- XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| \
- XAT0_AV_MODIFIED|XAT0_AV_SCANSTAMP)
+ XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \
+ XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE)
/* Support for XAT_* optional attributes */
#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */
@@ -246,6 +250,10 @@ typedef struct xvattr {
#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
+#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
+#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
+#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
+#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
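A hedged kernel-side sketch of requesting and checking the new bits through
the extensible vattr interface; xva_init(), XVA_SET_REQ() and XVA_ISSET_RTN()
are declared elsewhere in this header, and xva_xoptattrs is assumed to be the
embedded xoptattr_t.

static void
request_reparse_bits(xvattr_t *xvap)
{
	xva_init(xvap);
	XVA_SET_REQ(xvap, XAT_REPARSE);
	XVA_SET_REQ(xvap, XAT_SPARSE);
	/* hand &xvap->xva_vattr to the filesystem's getattr entry point */
}

static boolean_t
is_reparse_point(xvattr_t *xvap)
{
	/* only meaningful after the filesystem filled in the return map */
	return (XVA_ISSET_RTN(xvap, XAT_REPARSE) &&
	    xvap->xva_xoptattrs.xoa_reparse != 0);
}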
/*
* The returned attribute map array (xva_rtnattrmap[]) is located past the
@@ -305,7 +313,6 @@ typedef struct xvattr {
#define MODEMASK 07777 /* mode bits plus permission bits */
#define PERMMASK 00777 /* permission bits */
-
/*
* VOP_ACCESS flags
*/
@@ -358,15 +365,12 @@ typedef struct caller_context {
ulong_t cc_flags;
} caller_context_t;
-/*
- * Structure tags for function prototypes, defined elsewhere.
- */
struct taskq;
/*
* Flags for VOP_LOOKUP
*
- * Defined in file.h, but also possible, FIGNORECASE
+ * Defined in file.h; FIGNORECASE and FSEARCH are also possible
*
*/
#define LOOKUP_DIR 0x01 /* want parent dir vp */