aboutsummaryrefslogtreecommitdiff
path: root/sys/cddl/contrib/opensolaris/uts/common/fs
diff options
context:
space:
mode:
authorPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commit1ba4a712dde6e6c613fc411a96958b4ade67de4c (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs
parent8fc061164d74a4c9775f39da3c0b5d02112866c8 (diff)
downloadsrc-1ba4a712dde6e6c613fc411a96958b4ade67de4c.tar.gz
src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.zip
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Notes
Notes: svn path=/head/; revision=185029
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c397
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c74
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c2295
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c53
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c405
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c285
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c597
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c841
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c36
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c248
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c23
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c403
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c132
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c2413
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c735
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c496
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c395
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c313
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c929
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c35
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c156
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c249
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c32
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c3038
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c232
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c121
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c776
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c27
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h108
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h41
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h54
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h79
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h131
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h45
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h112
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h52
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h76
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h40
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h160
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h125
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h72
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h162
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h144
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h216
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h172
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c17
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c1206
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c105
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c287
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c445
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c547
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c155
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c84
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c180
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c178
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c421
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c2499
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c102
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c423
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c326
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c120
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c716
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c2294
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c456
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c579
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c796
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c1762
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c1101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c377
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c2470
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c96
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c715
110 files changed, 27710 insertions, 9431 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index dd2aa82304ab..d9eb88a40202 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -20,7 +20,7 @@
*/
/* Portions Copyright 2007 Shivakumar GN */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
+#include <sys/sunddi.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
@@ -60,7 +61,7 @@
*
* These routines are designed to play a support role for existing
* pseudo-filesystems (such as procfs). They simplify common tasks,
- * without enforcing the filesystem to hand over management to GFS. The
+ * without forcing the filesystem to hand over management to GFS. The
* routines covered are:
*
* gfs_readdir_init()
@@ -116,6 +117,42 @@
*/
/*
+ * gfs_get_parent_ino: used to obtain a parent inode number and the
+ * inode number of the given vnode in preparation for calling gfs_readdir_init.
+ */
+int
+gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ ino64_t *pino, ino64_t *ino)
+{
+ vnode_t *parent;
+ gfs_dir_t *dp = dvp->v_data;
+ int error;
+
+ *ino = dp->gfsd_file.gfs_ino;
+ parent = dp->gfsd_file.gfs_parent;
+
+ if (parent == NULL) {
+ *pino = *ino; /* root of filesystem */
+ } else if (dvp->v_flag & V_XATTRDIR) {
+#ifdef TODO
+ vattr_t va;
+
+ va.va_mask = AT_NODEID;
+ error = VOP_GETATTR(parent, &va, 0, cr, ct);
+ if (error)
+ return (error);
+ *pino = va.va_nodeid;
+#else
+ panic("%s:%u: not implemented", __func__, __LINE__);
+#endif
+ } else {
+ *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
+ }
+
+ return (0);
+}
+
+/*
* gfs_readdir_init: initiate a generic readdir
* st - a pointer to an uninitialized gfs_readdir_state_t structure
* name_max - the directory's maximum file name length
@@ -123,6 +160,7 @@
* uiop - the uiop passed to readdir
* parent - the parent directory's inode
* self - this directory's inode
+ * flags - flags from VOP_READDIR
*
* Returns 0 or a non-zero errno.
*
@@ -153,8 +191,10 @@
*/
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
- uio_t *uiop, ino64_t parent, ino64_t self)
+ uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
+ size_t dirent_size;
+
if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
(uiop->uio_loffset % ureclen) != 0)
return (EINVAL);
@@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
st->grd_ureclen = ureclen;
st->grd_oresid = uiop->uio_resid;
st->grd_namlen = name_max;
- st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
+ if (flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
st->grd_parent = parent;
st->grd_self = self;
+ st->grd_flags = flags;
return (0);
}
@@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
/*
* gfs_readdir_emit_int: internal routine to emit directory entry
*
- * st - the current readdir state, which must have d_ino and d_name
- * set
+ * st - the current readdir state, which must have d_ino/ed_ino
+ * and d_name/ed_name set
* uiop - caller-supplied uio pointer
* next - the offset of the next entry
*/
@@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
int *ncookies, u_long **cookies)
{
int reclen, namlen;
+ dirent64_t *dp;
+ edirent_t *edp;
- namlen = strlen(st->grd_dirent->d_name);
- reclen = DIRENT64_RECLEN(namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp = st->grd_dirent;
+ namlen = strlen(edp->ed_name);
+ reclen = EDIRENT_RECLEN(namlen);
+ } else {
+ dp = st->grd_dirent;
+ namlen = strlen(dp->d_name);
+ reclen = DIRENT64_RECLEN(namlen);
+ }
if (reclen > uiop->uio_resid) {
/*
@@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
return (-1);
}
- /* XXX: This can change in the future. */
- st->grd_dirent->d_type = DT_DIR;
- st->grd_dirent->d_reclen = (ushort_t)reclen;
- st->grd_dirent->d_namlen = namlen;
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp->ed_off = next;
+ edp->ed_reclen = (ushort_t)reclen;
+ } else {
+ /* XXX: This can change in the future. */
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_type = DT_DIR;
+ dp->d_namlen = namlen;
+ }
if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
return (EFAULT);
@@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
* voff - the virtual offset (obtained from gfs_readdir_pred)
* ino - the entry's inode
* name - the entry's name
+ * eflags - value for ed_eflags (if processing edirent_t)
*
* Returns a 0 on success, a non-zero errno on failure, or -1 if the
* readdir loop should terminate. A non-zero result (either errno or
@@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
*/
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
- ino64_t ino, const char *name, int *ncookies, u_long **cookies)
+ ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies)
{
offset_t off = (voff + 2) * st->grd_ureclen;
- st->grd_dirent->d_ino = ino;
- (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edirent_t *edp = st->grd_dirent;
+
+ edp->ed_ino = ino;
+ (void) strncpy(edp->ed_name, name, st->grd_namlen);
+ edp->ed_eflags = eflags;
+ } else {
+ dirent64_t *dp = st->grd_dirent;
+
+ dp->d_ino = ino;
+ (void) strncpy(dp->d_name, name, st->grd_namlen);
+ }
/*
* Inter-entry offsets are invalid, so we assume a record size of
@@ -266,11 +336,11 @@ top:
voff = off - 2;
if (off == 0) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
- ".", ncookies, cookies)) == 0)
+ ".", 0, ncookies, cookies)) == 0)
goto top;
} else if (off == 1) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
- "..", ncookies, cookies)) == 0)
+ "..", 0, ncookies, cookies)) == 0)
goto top;
} else {
*voffp = voff;
@@ -292,7 +362,13 @@ top:
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
- kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
+ size_t dirent_size;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ kmem_free(st->grd_dirent, dirent_size);
if (error > 0)
return (error);
if (eofp)
@@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp)
gfs_dir_t *dp = NULL;
void *data;
- if (fp->gfs_parent == NULL)
+ if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
goto found;
dp = fp->gfs_parent->v_data;
@@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp)
ge = NULL;
found:
+ if (vp->v_flag & V_XATTRDIR)
+ VI_LOCK(fp->gfs_parent);
VI_LOCK(vp);
ASSERT(vp->v_count < 2);
/*
@@ -535,7 +613,8 @@ found:
* Free vnode and release parent
*/
if (fp->gfs_parent) {
- gfs_dir_unlock(dp);
+ if (dp)
+ gfs_dir_unlock(dp);
VI_LOCK(fp->gfs_parent);
fp->gfs_parent->v_usecount--;
VI_UNLOCK(fp->gfs_parent);
@@ -543,6 +622,8 @@ found:
ASSERT(vp->v_vfsp != NULL);
VFS_RELE(vp->v_vfsp);
}
+ if (vp->v_flag & V_XATTRDIR)
+ VI_UNLOCK(fp->gfs_parent);
return (data);
}
@@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp)
}
/*
- * gfs_dir_lookup()
+ * gfs_dir_lookup_dynamic()
*
- * Looks up the given name in the directory and returns the corresponding vnode,
- * if found.
+ * This routine looks up the provided name amongst the dynamic entries
+ * in the gfs directory and returns the corresponding vnode, if found.
*
- * First, we search statically defined entries, if any. If a match is found,
- * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
- * existing vnode. Otherwise, we call the static entry's callback routine,
- * caching the result if necessary.
+ * The gfs directory is expected to be locked by the caller prior to
+ * calling this function. The directory will be unlocked during the
+ * execution of this function, but will be locked upon return from the
+ * function. This function returns 0 on success, non-zero on error.
*
- * If no static entry is found, we invoke the lookup callback, if any. The
- * arguments to this callback are:
+ * The dynamic lookups are performed by invoking the lookup
+ * callback, which is passed to this function as the first argument.
+ * The arguments to the callback are:
*
- * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
+ * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ * int flags, int *deflgs, pathname_t *rpnp);
*
* pvp - parent vnode
* nm - name of entry
* vpp - pointer to resulting vnode
+ * cr - pointer to cred
+ * flags - flags value from lookup request
+ * ignored here; currently only used to request
+ * insensitive lookups
+ * direntflgs - output parameter, directory entry flags
+ * ignored here; currently only used to indicate a lookup
+ * has more than one possible match when case is not considered
+ * realpnp - output parameter, real pathname
+ * ignored here; when lookup was performed case-insensitively,
+ * this field contains the "real" name of the file.
*
* Returns 0 on success, non-zero on error.
*/
-int
-gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
+static int
+gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
+ const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
+ int *direntflags, pathname_t *realpnp)
{
- int i;
- gfs_dirent_t *ge;
- vnode_t *vp;
- gfs_dir_t *dp = dvp->v_data;
- int ret = 0;
-
- ASSERT(dvp->v_type == VDIR);
+ gfs_file_t *fp;
+ ino64_t ino;
+ int ret;
- if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
- return (0);
+ ASSERT(GFS_DIR_LOCKED(dp));
+ /*
+ * Drop the directory lock, as the lookup routine
+ * will need to allocate memory, or otherwise deadlock on this
+ * directory.
+ */
+ gfs_dir_unlock(dp);
+ ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
gfs_dir_lock(dp);
/*
+ * The callback for extended attributes returns a vnode
+ * with v_data from an underlying fs.
+ */
+ if (ret == 0 && !IS_XATTRDIR(dvp)) {
+ fp = (gfs_file_t *)((*vpp)->v_data);
+ fp->gfs_index = -1;
+ fp->gfs_ino = ino;
+ }
+
+ return (ret);
+}
+
+/*
+ * gfs_dir_lookup_static()
+ *
+ * This routine looks up the provided name amongst the static entries
+ * in the gfs directory and returns the corresponding vnode, if found.
+ * The first argument to the function is a pointer to the comparison
+ * function this function should use to decide if names are a match.
+ *
+ * If a match is found, and GFS_CACHE_VNODE is set and the vnode
+ * exists, we simply return the existing vnode. Otherwise, we call
+ * the static entry's callback routine, caching the result if
+ * necessary. If the idx pointer argument is non-NULL, we use it to
+ * return the index of the matching static entry.
+ *
+ * The gfs directory is expected to be locked by the caller prior to calling
+ * this function. The directory may be unlocked during the execution of
+ * this function, but will be locked upon return from the function.
+ *
+ * This function returns 0 if a match is found, ENOENT if not.
+ */
+static int
+gfs_dir_lookup_static(int (*compare)(const char *, const char *),
+ gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
+ vnode_t **vpp, pathname_t *rpnp)
+{
+ gfs_dirent_t *ge;
+ vnode_t *vp = NULL;
+ int i;
+
+ ASSERT(GFS_DIR_LOCKED(dp));
+
+ /*
* Search static entries.
*/
for (i = 0; i < dp->gfsd_nstatic; i++) {
ge = &dp->gfsd_static[i];
- if (strcmp(ge->gfse_name, nm) == 0) {
+ if (compare(ge->gfse_name, nm) == 0) {
+ if (rpnp)
+ (void) strlcpy(rpnp->pn_buf, ge->gfse_name,
+ rpnp->pn_bufsize);
+
if (ge->gfse_vnode) {
ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
vp = ge->gfse_vnode;
VN_HOLD(vp);
- goto out;
+ break;
}
/*
@@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
* need to do KM_SLEEP allocations. If we return from
* the constructor only to find that a parallel
* operation has completed, and GFS_CACHE_VNODE is set
- * for this entry, we discard the result in favor of the
- * cached vnode.
+ * for this entry, we discard the result in favor of
+ * the cached vnode.
*/
gfs_dir_unlock(dp);
vp = ge->gfse_ctor(dvp);
@@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
gfs_dir_lock(dp);
}
}
-
- goto out;
+ break;
}
}
- /*
- * See if there is a dynamic constructor.
- */
- if (dp->gfsd_lookup) {
- ino64_t ino;
- gfs_file_t *fp;
+ if (vp == NULL)
+ return (ENOENT);
+ else if (idx)
+ *idx = i;
+ *vpp = vp;
+ return (0);
+}
- /*
- * Once again, drop the directory lock, as the lookup routine
- * will need to allocate memory, or otherwise deadlock on this
- * directory.
- */
- gfs_dir_unlock(dp);
- ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
- gfs_dir_lock(dp);
- if (ret != 0)
- goto out;
+/*
+ * gfs_dir_lookup()
+ *
+ * Looks up the given name in the directory and returns the corresponding
+ * vnode, if found.
+ *
+ * First, we search statically defined entries, if any, with a call to
+ * gfs_dir_lookup_static(). If no static entry is found, and we have
+ * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
+ *
+ * This function returns 0 on success, non-zero on error.
+ */
+int
+gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ int flags, int *direntflags, pathname_t *realpnp)
+{
+ gfs_dir_t *dp = dvp->v_data;
+ boolean_t casecheck;
+ vnode_t *dynvp = NULL;
+ vnode_t *vp = NULL;
+ int (*compare)(const char *, const char *);
+ int error, idx;
- fp = (gfs_file_t *)vp->v_data;
- fp->gfs_index = -1;
- fp->gfs_ino = ino;
- } else {
- /*
- * No static entry found, and there is no lookup callback, so
- * return ENOENT.
- */
- ret = ENOENT;
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
+ return (0);
+
+ casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
+ if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
+ (flags & FIGNORECASE))
+ compare = strcasecmp;
+ else
+ compare = strcmp;
+
+ gfs_dir_lock(dp);
+
+ error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
+
+ if (vp && casecheck) {
+ gfs_dirent_t *ge;
+ int i;
+
+ for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (strcasecmp(ge->gfse_name, nm) == 0) {
+ *direntflags |= ED_CASE_CONFLICT;
+ goto out;
+ }
+ }
+ }
+
+ if ((error || casecheck) && dp->gfsd_lookup)
+ error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
+ &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
+
+ if (vp && dynvp) {
+ /* static and dynamic entries are case-insensitive conflict */
+ ASSERT(casecheck);
+ *direntflags |= ED_CASE_CONFLICT;
+ VN_RELE(dynvp);
+ } else if (vp == NULL) {
+ vp = dynvp;
+ } else if (error == ENOENT) {
+ error = 0;
+ } else if (error) {
+ VN_RELE(vp);
+ vp = NULL;
}
out:
gfs_dir_unlock(dp);
- if (ret == 0)
- *vpp = vp;
- else
- *vpp = NULL;
-
- return (ret);
+ *vpp = vp;
+ return (error);
}
/*
@@ -731,13 +921,15 @@ out:
* This is significantly more complex, thanks to the particulars of
* VOP_READDIR().
*
- * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- * offset_t *off, offset_t *nextoff, void *data)
+ * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
+ * offset_t *off, offset_t *nextoff, void *data, int flags)
*
* vp - directory vnode
* dp - directory entry, sized according to maxlen given to
* gfs_dir_create(). callback must fill in d_name and
- * d_ino.
+ * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
+ * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
+ * is set in 'flags'.
* eofp - callback must set to 1 when EOF has been reached
* off - on entry, the last offset read from the directory. Callback
* must set to the offset of the current entry, typically left
@@ -745,12 +937,13 @@ out:
* nextoff - callback must set to offset of next entry. Typically
* (off + 1)
* data - caller-supplied data
+ * flags - VOP_READDIR flags
*
* Return 0 on success, or error on failure.
*/
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
- u_long **cookies, void *data)
+ u_long **cookies, void *data, cred_t *cr, int flags)
{
gfs_readdir_state_t gstate;
int error, eof = 0;
@@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
offset_t off, next;
gfs_dir_t *dp = dvp->v_data;
- ino = dp->gfsd_file.gfs_ino;
-
- if (dp->gfsd_file.gfs_parent == NULL)
- pino = ino; /* root of filesystem */
- else
- pino = ((gfs_file_t *)
- (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
+ error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
+ if (error)
+ return (error);
if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
- pino, ino)) != 0)
+ pino, ino, flags)) != 0)
return (error);
while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
@@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
ino = dp->gfsd_inode(dvp, off);
if ((error = gfs_readdir_emit(&gstate, uiop,
- off, ino, dp->gfsd_static[off].gfse_name, ncookies,
- cookies)) != 0)
+ off, ino, dp->gfsd_static[off].gfse_name, 0,
+ ncookies, cookies)) != 0)
break;
} else if (dp->gfsd_readdir) {
@@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
if ((error = dp->gfsd_readdir(dvp,
gstate.grd_dirent, &eof, &off, &next,
- data)) != 0 || eof)
+ data, flags)) != 0 || eof)
break;
off += dp->gfsd_nstatic + 2;
@@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
}
/*
+ * gfs_vop_lookup: VOP_LOOKUP() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_lookup() as necessary.
+ */
+/* ARGSUSED */
+int
+gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
+}
+
+/*
* gfs_vop_readdir: VOP_READDIR() entry point
*
* For use directly in vnode ops table. Given a GFS directory, calls
@@ -827,6 +1031,7 @@ gfs_vop_readdir(ap)
{
vnode_t *vp = ap->a_vp;
uio_t *uiop = ap->a_uio;
+ cred_t *cr = ap->a_cred;
int *eofp = ap->a_eofflag;
int ncookies = 0;
u_long *cookies = NULL;
@@ -842,7 +1047,8 @@ gfs_vop_readdir(ap)
*ap->a_ncookies = ncookies;
}
- error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
+ error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL,
+ cr, 0);
if (error == 0) {
/* Subtract unused cookies */
@@ -882,6 +1088,9 @@ gfs_vop_inactive(ap)
if (data != NULL)
kmem_free(data, fp->gfs_size);
+
+ VI_LOCK(vp);
vp->v_data = NULL;
+ VI_UNLOCK(vp);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
new file mode 100644
index 000000000000..00a10aae8ec9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Zero out the structure, set the size of the requested/returned bitmaps,
+ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
+ * to the returned attributes array.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+ bzero(xvap, sizeof (xvattr_t));
+ xvap->xva_mapsize = XVA_MAPSIZE;
+ xvap->xva_magic = XVA_MAGIC;
+ xvap->xva_vattr.va_mask = AT_XVATTR;
+ xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
+}
+
+/*
+ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
+ * structure. Otherwise, returns NULL.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ if (xvap->xva_vattr.va_mask & AT_XVATTR)
+ xoap = &xvap->xva_xoptattrs;
+ return (xoap);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 420f802f360d..7ca528033c4f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DVA-based Adjustable Replacement Cache
*
@@ -47,13 +45,13 @@
* There are times when it is not possible to evict the requested
* space. In these circumstances we are unable to adjust the cache
* size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slowes the flow of new data
- * into the cache until we can make space avaiable.
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
*
* 2. The Megiddo and Modha model assumes a fixed cache size.
* Pages are evicted when the cache is full and there is a cache
* miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory preasure from the
+ * high use, but also tries to react to memory pressure from the
* operating system: decreasing its size when system memory is
* tight.
*
@@ -75,7 +73,7 @@
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() inerface
+ * or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
* adjusting the cache use method 2. We therefor provide two
* types of locks: 1) the hash table lock array, and 2) the
@@ -109,6 +107,14 @@
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
*/
#include <sys/spa.h>
@@ -117,6 +123,7 @@
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
+#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern kmutex_t zfs_write_limit_lock;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan;
static int arc_dead;
/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
* These tunables are for performance analysis.
*/
-u_long zfs_arc_max;
-u_long zfs_arc_min;
-TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
-TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+int zfs_mdcomp_disable = 0;
+
+TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
+TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
"Maximum ARC size");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
"Minimum ARC size");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
- * Note that buffers can be on one of 5 states:
+ * Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recentely used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
- * When there are no active references to the buffer, they
- * are linked onto one of the lists in arc. These are the
- * only buffers that can be evicted or deleted.
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they are
+ * are linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
*
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
@@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
* they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will aquire a DVA
* as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
*/
typedef struct arc_state {
- list_t arcs_list; /* linked list of evictable buffer in state */
- uint64_t arcs_lsize; /* total size of buffers in the linked list */
- uint64_t arcs_size; /* total size of all buffers in this state */
+ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
+ uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
} arc_state_t;
-/* The 5 states: */
+/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
@@ -222,6 +259,24 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_memory_throttle_count;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -252,7 +307,25 @@ static arc_stats_t arc_stats = {
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -299,6 +372,7 @@ static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
/*
* There are several ARC variables that are critical to export as kstats --
@@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost;
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
+ &arc_meta_used, 0, "ARC metadata used");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
+ &arc_meta_limit, 0, "ARC metadata limit");
+
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
- arc_byteswap_func_t *acb_byteswap;
arc_buf_t *acb_buf;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
@@ -368,6 +450,9 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
};
static arc_buf_t *arc_eviction_list;
@@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
/*
* Private ARC flags. These flags are private ARC only flags that will show up
@@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
+#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
+ (hdr)->b_l2hdr != NULL)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
/*
* Hash table routines
@@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table;
uint64_t zfs_crc64_table[256];
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_FEED_SECS 1 /* caching interval */
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_boost; /* warmup write boost, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ daddr_t b_daddr; /* disk address, offset byte */
+};
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
{
uintptr_t spav = (uintptr_t)spa;
uint8_t *vdva = (uint8_t *)dva;
@@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
bzero(buf, sizeof (arc_buf_hdr_t));
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
return (0);
}
@@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
+ mutex_destroy(&buf->b_freeze_lock);
+
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ rw_destroy(&buf->b_lock);
}
/*
@@ -639,7 +854,7 @@ retry:
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
@@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+ int equal;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ return (equal);
+}
+
static void
-arc_cksum_compute(arc_buf_t *buf)
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
mutex_enter(&buf->b_hdr->b_freeze_lock);
@@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf)
void
arc_buf_thaw(arc_buf_t *buf)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ }
- if (buf->b_hdr->b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
mutex_enter(&buf->b_hdr->b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
@@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf)
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
}
static void
@@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
uint64_t delta = ab->b_size * ab->b_datacnt;
+ list_t *list = &ab->b_state->arcs_list[ab->b_type];
+ uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
mutex_enter(&ab->b_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->arcs_list, ab);
+ list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
ASSERT(delta > 0);
- ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
- atomic_add_64(&ab->b_state->arcs_lsize, -delta);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
mutex_exit(&ab->b_state->arcs_mtx);
- /* remove the prefetch flag is we get a reference */
+ /* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
}
@@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
+ uint64_t *size = &state->arcs_lsize[ab->b_type];
+
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list, ab);
+ list_insert_head(&state->arcs_list[ab->b_type], ab);
ASSERT(ab->b_datacnt > 0);
- atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
- ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
+ atomic_add_64(size, ab->b_size * ab->b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
@@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
if (refcnt == 0) {
if (old_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ uint64_t *size = &old_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list, ab);
+ list_remove(&old_state->arcs_list[ab->b_type], ab);
/*
* If prefetching out of the ghost cache,
@@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
from_delta = ab->b_size;
}
- ASSERT3U(old_state->arcs_lsize, >=, from_delta);
- atomic_add_64(&old_state->arcs_lsize, -from_delta);
+ ASSERT3U(*size, >=, from_delta);
+ atomic_add_64(size, -from_delta);
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
}
if (new_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ uint64_t *size = &new_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
- list_insert_head(&new_state->arcs_list, ab);
+ list_insert_head(&new_state->arcs_list[ab->b_type], ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
to_delta = ab->b_size;
}
- atomic_add_64(&new_state->arcs_lsize, to_delta);
- ASSERT3U(new_state->arcs_size + to_delta, >=,
- new_state->arcs_lsize);
+ atomic_add_64(size, to_delta);
if (use_mutex)
mutex_exit(&new_state->arcs_mtx);
@@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
+ if (new_state == arc_anon) {
buf_hash_remove(ab);
}
@@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(&old_state->arcs_size, -from_delta);
}
ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+}
+
+void
+arc_space_consume(uint64_t space)
+{
+ atomic_add_64(&arc_meta_used, space);
+ atomic_add_64(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space)
+{
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ atomic_add_64(&arc_meta_used, -space);
+ ASSERT(arc_size >= space);
+ atomic_add_64(&arc_size, -space);
+}
+
+void *
+arc_data_buf_alloc(uint64_t size)
+{
+ if (arc_evict_needed(ARC_BUFC_DATA))
+ cv_signal(&arc_reclaim_thr_cv);
+ atomic_add_64(&arc_size, size);
+ return (zio_data_buf_alloc(size));
+}
+
+void
+arc_data_buf_free(void *buf, uint64_t size)
+{
+ zio_data_buf_free(buf, size);
+ ASSERT(arc_size >= size);
+ atomic_add_64(&arc_size, -size);
}
arc_buf_t *
@@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
hdr->b_spa = spa;
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
kmutex_t *hash_lock;
/*
- * Check to see if this buffer is currently being evicted via
- * arc_do_user_evicts().
+ * Check to see if this buffer is evicted. Callers
+ * must verify b_data != NULL to know if the add_ref
+ * was successful.
*/
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- mutex_exit(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_READER);
+ if (buf->b_data == NULL) {
+ rw_exit(&buf->b_lock);
return;
}
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
mutex_enter(hash_lock);
- if (buf->b_data == NULL) {
- /*
- * This buffer is evicted.
- */
- mutex_exit(hash_lock);
- return;
- }
+ rw_exit(&buf->b_lock);
- ASSERT(buf->b_hdr == hdr);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
arc_access(hdr, hash_lock);
@@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
data, metadata, hits);
}
+/*
+ * Free the arc data buffer. If it is an l2arc write in progress,
+ * the buffer is placed on l2arc_free_on_write to be freed later.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+ void *data, size_t size)
+{
+ if (HDR_L2_WRITING(hdr)) {
+ l2arc_data_free_t *df;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ free_func(data, size);
+ }
+}
+
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
@@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_cksum_verify(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
+ arc_space_return(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
+ atomic_add_64(&arc_size, -size);
}
- atomic_add_64(&arc_size, -size);
}
if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ uint64_t *cnt = &state->arcs_lsize[type];
+
ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
ASSERT(state != arc_anon);
- ASSERT3U(state->arcs_lsize, >=, size);
- atomic_add_64(&state->arcs_lsize, -size);
+
+ ASSERT3U(*cnt, >=, size);
+ atomic_add_64(cnt, -size);
}
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
@@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!(hdr->b_flags & ARC_STORED));
+
+ if (hdr->b_l2hdr != NULL) {
+ if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ if (hdr->b_l2hdr != NULL) {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
+ hdr);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+ } else {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ }
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
@@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- mutex_destroy(&hdr->b_freeze_lock);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf)
* - return the data block from this buffer rather than freeing it.
* This flag is used by callers that are trying to make space for a
* new buffer in a full arc cache.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
*/
static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type)
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
arc_buf_hdr_t *ab, *ab_prev = NULL;
+ list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
@@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
+ (spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
skipped++;
@@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
+ if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ missed += 1;
+ break;
+ }
if (buf->b_data) {
bytes_evicted += ab->b_size;
if (recycle && ab->b_type == type &&
- ab->b_size == bytes) {
+ ab->b_size == bytes &&
+ !HDR_L2_WRITING(ab)) {
stolen = buf->b_data;
recycle = FALSE;
}
@@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
} else {
+ rw_exit(&buf->b_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
}
- ASSERT(ab->b_datacnt == 0);
- arc_change_state(evicted_state, ab, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags = ARC_IN_HASH_TABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ if (ab->b_datacnt == 0) {
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags |= ARC_IN_HASH_TABLE;
+ ab->b_flags &= ~ARC_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ }
if (!have_lock)
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
@@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
if (missed)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
+ /*
+ * We have just evicted some date into the ghost state, make
+ * sure we also adjust the ghost state size if necessary.
+ */
+ if (arc_no_grow &&
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+ arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete =
+ MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+ arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
+
return (stolen);
}
@@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
@@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes)
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+ if (spa && ab->b_spa != spa)
+ continue;
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += ab->b_size;
- arc_hdr_destroy(ab);
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, ab, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ }
+
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
@@ -1256,6 +1630,12 @@ top:
}
mutex_exit(&state->arcs_mtx);
+ if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ (bytes < 0 || bytes_deleted < bytes)) {
+ list = &state->arcs_list[ARC_BUFC_METADATA];
+ goto top;
+ }
+
if (bufs_skipped) {
ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
ASSERT(bytes >= 0);
@@ -1271,38 +1651,58 @@ arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over, todelete;
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
- if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
- (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
}
mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
if (mru_over > 0) {
- if (arc_mru_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
- arc_evict_ghost(arc_mru_ghost, todelete);
+ if (arc_mru_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
}
}
if ((arc_over = arc_size - arc_c) > 0) {
int64_t tbl_over;
- if (arc_mfu->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
- (void) arc_evict(arc_mfu, toevict, FALSE,
- ARC_BUFC_UNDEF);
+ if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_DATA);
+ arc_over = arc_size - arc_c;
+ }
+
+ if (arc_over > 0 &&
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
+ arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
}
- tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
- arc_mfu_ghost->arcs_lsize - arc_c*2;
+ tbl_over = arc_size + arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c * 2;
- if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
- arc_evict_ghost(arc_mfu_ghost, todelete);
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
}
@@ -1314,7 +1714,9 @@ arc_do_user_evicts(void)
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
arc_eviction_list = buf->b_next;
+ rw_enter(&buf->b_lock, RW_WRITER);
buf->b_hdr = NULL;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
if (buf->b_efunc != NULL)
@@ -1329,24 +1731,40 @@ arc_do_user_evicts(void)
}
/*
- * Flush all *evictable* data from the cache.
+ * Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
void
-arc_flush(void)
+arc_flush(spa_t *spa)
{
- while (list_head(&arc_mru->arcs_list))
- (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
- while (list_head(&arc_mfu->arcs_list))
- (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
- arc_evict_ghost(arc_mru_ghost, -1);
- arc_evict_ghost(arc_mfu_ghost, -1);
+ arc_evict_ghost(arc_mru_ghost, spa, -1);
+ arc_evict_ghost(arc_mfu_ghost, spa, -1);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
mutex_exit(&arc_reclaim_thr_lock);
- ASSERT(arc_eviction_list == NULL);
+ ASSERT(spa || arc_eviction_list == NULL);
}
int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
@@ -1380,7 +1798,7 @@ arc_shrink(void)
arc_adjust();
}
-static int zfs_needfree = 0;
+static int needfree = 0;
static int
arc_reclaim_needed(void)
@@ -1391,13 +1809,28 @@ arc_reclaim_needed(void)
#ifdef _KERNEL
- if (zfs_needfree)
+ if (needfree)
return (1);
#if 0
/*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
* check to make sure that swapfs has enough space so that anon
- * reservations can still succeeed. anon_resvmem() checks that the
+ * reservations can still succeed. anon_resvmem() checks that the
* availrmem is greater than swapfs_minfree, and the number of reserved
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
@@ -1405,23 +1838,6 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then check that the size of available vmem for this area remains
- * above 1/4th free. This needs to be done when the size of the
- * non-default segment is smaller than physical memory, so we could
- * conceivably run out of VA in that segment before running out of
- * physical memory.
- */
- if (zio_arena != NULL) {
- size_t arc_ziosize =
- btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
-
- if ((physmem > arc_ziosize) &&
- (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
- return (1);
- }
-
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -1431,7 +1847,7 @@ arc_reclaim_needed(void)
* can have in the system. However, this is generally fixed at 25 pages
* which is so low that it's useless. In this comparison, we seek to
* calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the caclulation, if less than 1/4th is
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
if (btop(vmem_size(heap_arena, VMEM_FREE)) <
@@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
#ifdef _KERNEL
- /*
- * First purge some DNLC entries, in case the DNLC is using
- * up too much memory.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-
+ if (arc_meta_used >= arc_meta_limit) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
#if defined(__i386)
/*
* Reclaim unused memory from all kmem caches.
@@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * An agressive reclamation will shrink the cache size as well as
+ * An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
@@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused)
/* reset the growth delay for every reclaim */
growtime = LBOLT + (arc_grow_retry * hz);
- ASSERT(growtime > 0);
- if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
+ if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
/*
- * If zfs_needfree is TRUE our vm_lowmem hook
+ * If needfree is TRUE our vm_lowmem hook
* was called and in that case we must free some
* memory, so switch to aggressive mode.
*/
@@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused)
last_reclaim = ARC_RECLAIM_AGGR;
}
arc_kmem_reap_now(last_reclaim);
- } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) {
+ arc_warm = B_TRUE;
+
+ } else if (arc_no_grow && LBOLT >= growtime) {
arc_no_grow = FALSE;
}
- if (zfs_needfree ||
+ if (needfree ||
(2 * arc_c < arc_size +
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
arc_adjust();
@@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused)
arc_do_user_evicts();
if (arc_reclaim_needed()) {
- zfs_needfree = 0;
+ needfree = 0;
#ifdef _KERNEL
- wakeup(&zfs_needfree);
+ wakeup(&needfree);
#endif
}
@@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
+ if (state == arc_l2c_only)
+ return;
+
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
@@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state)
* prior to insert.
*/
static int
-arc_evict_needed()
+arc_evict_needed(arc_buf_contents_t type)
{
+ if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
+ return (1);
+
+#if 0
+#ifdef _KERNEL
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this area remains
+ * above about 1/32nd free.
+ */
+ if (type == ARC_BUFC_DATA && zio_arena != NULL &&
+ vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
+ return (1);
+#endif
+#endif
+
if (arc_reclaim_needed())
return (1);
@@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf)
* We have not yet reached cache maximum size,
* just allocate a new buffer.
*/
- if (!arc_evict_needed()) {
+ if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
goto out;
}
@@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf)
if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_p > mru_used) ? arc_mfu : arc_mru;
+ state = (arc_mfu->arcs_lsize[type] > 0 &&
+ arc_p > mru_used) ? arc_mfu : arc_mru;
} else {
/* MFU cases */
uint64_t mfu_space = arc_c - arc_p;
- state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ state = (arc_mru->arcs_lsize[type] > 0 &&
+ mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -1728,7 +2170,7 @@ out:
atomic_add_64(&hdr->b_state->arcs_size, size);
if (list_link_active(&hdr->b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize, size);
+ atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
}
/*
* If we are growing the cache, and we are adding anonymous
@@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
if (refcount_count(&buf->b_refcnt) == 0) {
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mru->arcs_mtx);
- list_remove(&arc_mru->arcs_list, buf);
- list_insert_head(&arc_mru->arcs_list, buf);
- mutex_exit(&arc_mru->arcs_mtx);
} else {
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
@@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mfu->arcs_mtx);
- list_remove(&arc_mfu->arcs_list, buf);
- list_insert_head(&arc_mfu->arcs_list, buf);
- mutex_exit(&arc_mfu->arcs_mtx);
}
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = LBOLT;
@@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ buf->b_arc_access = LBOLT;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
@@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_done_func_t */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
@@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio)
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~ARC_L2_EVICTED;
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags &= ~ARC_L2CACHE;
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
- callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+ byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+ func(buf->b_data, hdr->b_size);
+ }
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
/* create copies of the data buffer for the callers */
abuf = buf;
@@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
}
/*
@@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio)
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp. But if you know you don't need locking, you can use
+ * arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ int err;
+ arc_buf_hdr_t *hdr = pbuf->b_hdr;
+
+ ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
+ ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
+ rw_enter(&pbuf->b_lock, RW_READER);
+
+ err = arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb);
+
+ ASSERT3P(hdr, ==, pbuf->b_hdr);
+ rw_exit(&pbuf->b_lock);
+ return (err);
+}
+
+int
+arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
kmutex_t *hash_lock;
- zio_t *rzio;
+ zio_t *rzio;
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
@@ -2053,10 +2525,9 @@ top:
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, flags);
+ spa, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
acb->acb_next = hdr->b_acb;
@@ -2093,6 +2564,8 @@ top:
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -2104,6 +2577,8 @@ top:
} else {
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ daddr_t addr;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -2130,6 +2605,8 @@ top:
private);
hdr->b_flags |= ARC_PREFETCH;
}
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_INDIRECT;
} else {
@@ -2144,7 +2621,9 @@ top:
hdr->b_flags |= ARC_PREFETCH;
else
add_reference(hdr, hash_lock, private);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -2160,7 +2639,6 @@ top:
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
ASSERT(hdr->b_acb == NULL);
hdr->b_acb = acb;
@@ -2176,6 +2654,18 @@ top:
if (GHOST_STATE(hdr->b_state))
arc_access(hdr, hash_lock);
+
+ if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
+ (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ addr = hdr->b_l2hdr->b_daddr;
+ /*
+ * Lock out device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
@@ -2186,8 +2676,65 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
+ if (vd != NULL) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ */
+ if (hdr->b_l2hdr != NULL &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority, zio_flags |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+
+ if (*arc_flags & ARC_NOWAIT) {
+ zio_nowait(rzio);
+ return (0);
+ }
+
+ ASSERT(*arc_flags & ARC_WAIT);
+ if (zio_wait(rzio) == 0)
+ return (0);
+
+ /* l2arc read error; goto zio_read() */
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ }
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags, zb);
+ arc_read_done, buf, priority, zio_flags, zb);
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_buf_t **bufp;
- mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
return (0);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
-
- if (buf->b_data == NULL) {
+ } else if (buf->b_data == NULL) {
+ arc_buf_t copy = *buf; /* structure assignment */
/*
- * We are on the eviction list.
+ * We are on the eviction list; process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
*/
- mutex_exit(hash_lock);
- mutex_enter(&arc_eviction_mtx);
- if (buf->b_hdr == NULL) {
- /*
- * We are already in arc_do_user_evicts().
- */
- mutex_exit(&arc_eviction_mtx);
- return (0);
- } else {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * Process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
+ buf->b_efunc = NULL;
+ rw_exit(&buf->b_lock);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
}
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
@@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf)
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&old_state->arcs_mtx);
}
mutex_exit(hash_lock);
+ rw_exit(&buf->b_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
@@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf)
* Release this buffer from the cache. This must be done
* after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
+ * a new hdr for the buffer.
*/
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_size;
+
+ rw_enter(&buf->b_lock, RW_WRITER);
+ hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
@@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf);
+ rw_exit(&buf->b_lock);
return;
}
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ l2hdr = hdr->b_l2hdr;
+ if (l2hdr) {
+ mutex_enter(&l2arc_buflist_mtx);
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+
/*
* Do we have more than one buf?
*/
- if (hdr->b_buf != buf || buf->b_next != NULL) {
+ if (hdr->b_datacnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
- ASSERT(hdr->b_datacnt > 1);
+ ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
* Pull the data off of this buf and attach it to
* a new anonymous buf.
@@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
if (refcount_is_zero(&hdr->b_refcnt)) {
- ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
+ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3U(*size, >=, hdr->b_size);
+ atomic_add_64(size, -hdr->b_size);
}
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
nhdr->b_type = type;
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
nhdr->b_freeze_cksum = NULL;
- mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
+ rw_exit(&buf->b_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
} else {
+ rw_exit(&buf->b_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
mutex_exit(hash_lock);
+
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
@@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag)
}
buf->b_efunc = NULL;
buf->b_private = NULL;
+
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ mutex_exit(&l2arc_buflist_mtx);
+ }
}
int
arc_released(arc_buf_t *buf)
{
- return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ int released;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ rw_exit(&buf->b_lock);
+ return (released);
}
int
arc_has_callback(arc_buf_t *buf)
{
- return (buf->b_efunc != NULL);
+ int callback;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ callback = (buf->b_efunc != NULL);
+ rw_exit(&buf->b_lock);
+ return (callback);
}
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
- return (refcount_count(&buf->b_hdr->b_refcnt));
+ int referenced;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ rw_exit(&buf->b_lock);
+ return (referenced);
}
#endif
@@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- if (callback->awcb_ready) {
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum.
+ * It is the responsibility of the callback to handle the
+ * accounting for any re-write attempt.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
}
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
static void
@@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio)
hdr->b_acb = NULL;
- /* this buffer is on no lists and is not in the hash table */
- ASSERT3P(hdr->b_state, ==, arc_anon);
-
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
@@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio)
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
BP_IDENTITY(zio->io_bp)));
ASSERT3U(zio->io_bp_orig.blk_birth, ==,
@@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio)
ASSERT3P(exists, ==, NULL);
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- arc_access(hdr, hash_lock);
+ /* if it's not anon, we are doing a scrub */
+ if (hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else if (callback->awcb_done == NULL) {
int destroy_hdr;
@@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio)
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
+ hdr->b_flags &= ~ARC_STORED;
if (callback->awcb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
@@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio)
kmem_free(callback, sizeof (arc_write_callback_t));
}
+static void
+write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
+{
+ boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
+
+ /* Determine checksum setting */
+ if (ismd) {
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
+ !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
+ zp->zp_checksum = wp->wp_oschecksum;
+ else
+ zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ } else {
+ zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
+ wp->wp_oschecksum);
+ }
+
+ /* Determine compression setting */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+ } else {
+ zp->zp_compress = zio_compress_select(wp->wp_dncompress,
+ wp->wp_oscompress);
+ }
+
+ zp->zp_type = wp->wp_type;
+ zp->zp_level = wp->wp_level;
+ zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
+}
+
zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
+ boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
- zio_t *zio;
+ zio_t *zio;
+ zio_prop_t zp;
- /* this is a private buffer - no locking required */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(ready != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
ASSERT(hdr->b_acb == 0);
+ if (l2arc)
+ hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
- zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
- priority, flags, zb);
+
+ write_policy(spa, wp, &zp);
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
@@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
* nonzero, it should match what we have in the cache.
*/
ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
+ bp->blk_fill == BLK_FILL_ALREADY_FREED);
+
if (ab->b_state != arc_anon)
arc_change_state(arc_anon, ab, hash_lock);
if (HDR_IO_IN_PROGRESS(ab)) {
@@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
} else if (refcount_is_zero(&ab->b_refcnt)) {
+ ab->b_flags |= ARC_FREE_IN_PROGRESS;
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
ARCSTAT_BUMP(arcstat_deleted);
@@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
}
- zio = zio_free(pio, spa, txg, bp, done, private);
+ zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
if (arc_flags & ARC_WAIT)
return (zio_wait(zio));
@@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
return (0);
}
+static int
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t inflight_data = arc_anon->arcs_size;
+ uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
+ static uint64_t page_load = 0;
+ static uint64_t last_txg = 0;
+
+#if 0
+#if defined(__i386)
+ available_memory =
+ MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+#endif
+#endif
+ if (available_memory >= zfs_write_limit_max)
+ return (0);
+
+ if (txg > last_txg) {
+ last_txg = txg;
+ page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight,
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == pageproc) {
+ if (page_load > available_memory / 4)
+ return (ERESTART);
+ /* Note: reserve is inflated, so we deflate */
+ page_load += reserve / 8;
+ return (0);
+ } else if (page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (EAGAIN);
+ }
+ page_load = 0;
+
+ if (arc_size > arc_c_min) {
+ uint64_t evictable_memory =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+ }
+
+ if (inflight_data > available_memory / 4) {
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (ERESTART);
+ }
+#endif
+ return (0);
+}
+
void
-arc_tempreserve_clear(uint64_t tempreserve)
+arc_tempreserve_clear(uint64_t reserve)
{
- atomic_add_64(&arc_tempreserve, -tempreserve);
+ atomic_add_64(&arc_tempreserve, -reserve);
ASSERT((int64_t)arc_tempreserve >= 0);
}
int
-arc_tempreserve_space(uint64_t tempreserve)
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
+ int error;
+
#ifdef ZFS_DEBUG
/*
* Once in a while, fail for no reason. Everything should cope.
@@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve)
return (ERESTART);
}
#endif
- if (tempreserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, tempreserve * 4);
- if (tempreserve > arc_c)
+ if (reserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, reserve * 4);
+ if (reserve > arc_c)
return (ENOMEM);
/*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefor need to
+ * make sure that there is sufficient available memory for this.
+ */
+ if (error = arc_memory_throttle(reserve, txg))
+ return (error);
+
+ /*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
- *
- * XXX The limit should be adjusted dynamically to keep the time
- * to sync a dataset fixed (around 1-5 seconds?).
*/
-
- if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
- dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
- "tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
- tempreserve>>10, arc_c>>10);
+ if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+ arc_anon->arcs_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+ reserve>>10, arc_c>>10);
return (ERESTART);
}
- atomic_add_64(&arc_tempreserve, tempreserve);
+ atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
@@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused)
/* Serialize access via arc_lowmem_lock. */
mutex_enter(&arc_lowmem_lock);
- zfs_needfree = 1;
+ needfree = 1;
cv_signal(&arc_reclaim_thr_cv);
- while (zfs_needfree)
- tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
+ while (needfree)
+ tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
mutex_exit(&arc_lowmem_lock);
}
#endif
@@ -2743,6 +3442,16 @@ arc_init(void)
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -2757,6 +3466,7 @@ arc_init(void)
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -2764,15 +3474,28 @@ arc_init(void)
mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
@@ -2798,6 +3521,13 @@ arc_init(void)
#endif
arc_dead = FALSE;
+ arc_warm = B_FALSE;
+
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
#ifdef _KERNEL
/* Warn about ZFS memory and address space requirements. */
@@ -2808,9 +3538,9 @@ arc_init(void)
if (kmem_size() < 512 * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
"expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
+ printf(" Consider tuning vm.kmem_size and "
"vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
+ printf(" in /boot/loader.conf.\n");
}
#endif
}
@@ -2818,6 +3548,7 @@ arc_init(void)
void
arc_fini(void)
{
+
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
cv_signal(&arc_reclaim_thr_cv);
@@ -2825,7 +3556,7 @@ arc_fini(void)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
mutex_exit(&arc_reclaim_thr_lock);
- arc_flush();
+ arc_flush(NULL);
arc_dead = TRUE;
@@ -2838,10 +3569,14 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc_mru->arcs_list);
- list_destroy(&arc_mru_ghost->arcs_list);
- list_destroy(&arc_mfu->arcs_list);
- list_destroy(&arc_mfu_ghost->arcs_list);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
mutex_destroy(&arc_anon->arcs_mtx);
mutex_destroy(&arc_mru->arcs_mtx);
@@ -2849,6 +3584,8 @@ arc_fini(void)
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ mutex_destroy(&zfs_write_limit_lock);
+
buf_fini();
mutex_destroy(&arc_lowmem_lock);
@@ -2857,3 +3594,985 @@ arc_fini(void)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate for this there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ */
+
+static void
+l2arc_hdr_stat_add(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+static void
+l2arc_hdr_stat_remove(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+}
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev));
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev))
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write()
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT(df->l2df_data != NULL);
+ ASSERT(df->l2df_func != NULL);
+ df->l2df_func(df->l2df_data, df->l2df_size);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *ab, *ab_prev;
+ l2arc_buf_hdr_t *abl2;
+ kmutex_t *hash_lock;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT(dev != NULL);
+ head = cb->l2wcb_head;
+ ASSERT(head != NULL);
+ buflist = dev->l2ad_buflist;
+ ASSERT(buflist != NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+ for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * This buffer misses out. It may be in a stage
+ * of eviction. Its ARC_L2_WRITING flag will be
+ * left set, denying reads to this buffer.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+ continue;
+ }
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, ab);
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+
+ /*
+ * Allow ARC to begin reads to this L2ARC entry.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ int equal;
+
+ ASSERT(zio->io_vd != NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ buf = cb->l2rcb_buf;
+ ASSERT(buf != NULL);
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ equal = arc_cksum_equal(buf);
+ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = buf;
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ arc_read_done(zio);
+ } else {
+ mutex_exit(hash_lock);
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = EIO;
+ }
+ if (!equal)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL)
+ zio_nowait(zio_read(zio->io_parent,
+ cb->l2rcb_spa, &cb->l2rcb_bp,
+ buf->b_data, zio->io_size, arc_read_done, buf,
+ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static list_t *
+l2arc_list_locked(int list_num, kmutex_t **lock)
+{
+ list_t *list;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 1:
+ list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ case 2:
+ list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 3:
+ list = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ }
+
+ ASSERT(!(MUTEX_HELD(*lock)));
+ mutex_enter(*lock);
+ return (list);
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ buflist = dev->l2ad_buflist;
+
+ if (buflist == NULL)
+ return;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&l2arc_buflist_mtx);
+ for (ab = list_tail(buflist); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&l2arc_buflist_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ if (HDR_L2_WRITE_HEAD(ab)) {
+ /*
+ * We hit a write head node. Leave it for
+ * l2arc_write_done().
+ */
+ list_remove(buflist, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (!all && ab->b_l2hdr != NULL &&
+ (ab->b_l2hdr->b_daddr > taddr ||
+ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (HDR_FREE_IN_PROGRESS(ab)) {
+ /*
+ * Already on the path to destruction.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_state == arc_l2c_only) {
+ ASSERT(!HDR_L2_READING(ab));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_size.
+ */
+ arc_change_state(arc_anon, ab, hash_lock);
+ arc_hdr_destroy(ab);
+ } else {
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(ab)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ ab->b_flags |= ARC_L2_EVICTED;
+ }
+
+ /*
+ * Tell ARC this no longer exists in L2ARC.
+ */
+ if (ab->b_l2hdr != NULL) {
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+ list_remove(buflist, ab);
+
+ /*
+ * This may have been leftover after a
+ * failed write.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ dev->l2ad_evict = taddr;
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ */
+static void
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *ab, *ab_prev, *head;
+ l2arc_buf_hdr_t *hdrl2;
+ list_t *list;
+ uint64_t passed_sz, write_sz, buf_sz, headroom;
+ void *buf_data;
+ kmutex_t *hash_lock, *list_lock;
+ boolean_t have_lock, full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+ int try;
+
+ ASSERT(dev->l2ad_vdev != NULL);
+
+ pio = NULL;
+ write_sz = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head->b_flags |= ARC_L2_WRITE_HEAD;
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ for (try = 0; try <= 3; try++) {
+ list = l2arc_list_locked(try, &list_lock);
+ passed_sz = 0;
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ headroom = target_sz * l2arc_headroom;
+ if (arc_warm == B_FALSE)
+ ab = list_head(list);
+ else
+ ab = list_tail(list);
+
+ for (; ab; ab = ab_prev) {
+ if (arc_warm == B_FALSE)
+ ab_prev = list_next(list, ab);
+ else
+ ab_prev = list_prev(list, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (!have_lock && !mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += ab->b_size;
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_spa != spa) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if ((write_sz + ab->b_size) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_buf == NULL) {
+ DTRACE_PROBE1(l2arc__buf__null, void *, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ list_insert_head(dev->l2ad_buflist, head);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ /*
+ * Create and add a new L2ARC header.
+ */
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2->b_dev = dev;
+ hdrl2->b_daddr = dev->l2ad_hand;
+
+ ab->b_flags |= ARC_L2_WRITING;
+ ab->b_l2hdr = hdrl2;
+ list_insert_head(dev->l2ad_buflist, ab);
+ buf_data = ab->b_buf->b_data;
+ buf_sz = ab->b_size;
+
+ /*
+ * Compute and store the buffer cksum before
+ * writing. On debug the cksum is verified first.
+ */
+ arc_cksum_verify(ab->b_buf);
+ arc_cksum_compute(ab->b_buf, B_TRUE);
+
+ mutex_exit(hash_lock);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /*
+ * Keep the clock hand suitably device-aligned.
+ */
+ buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+
+ write_sz += buf_sz;
+ dev->l2ad_hand += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ if (pio == NULL) {
+ ASSERT3U(write_sz, ==, 0);
+ kmem_cache_free(hdr_cache, head);
+ return;
+ }
+
+ ASSERT3U(write_sz, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_size, write_sz);
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ spa_l2cache_space_update(dev->l2ad_vdev, 0,
+ dev->l2ad_end - dev->l2ad_hand);
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
+ (void) zio_wait(pio);
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+static void
+l2arc_feed_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ /*
+ * Pause for l2arc_feed_secs seconds between writes.
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ hz * l2arc_feed_secs);
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT(spa != NULL);
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = dev->l2ad_write;
+ if (arc_warm == B_FALSE)
+ size += dev->l2ad_boost;
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ l2arc_write_buffers(spa, dev, size);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev != NULL);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+{
+ l2arc_dev_t *adddev;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ adddev->l2ad_write = l2arc_write_max;
+ adddev->l2ad_boost = l2arc_write_boost;
+ adddev->l2ad_start = start;
+ adddev->l2ad_end = end;
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ ASSERT3U(adddev->l2ad_write, >, 0);
+
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2node));
+
+ spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
+ nextdev = list_next(l2arc_dev_list, dev);
+ if (vd == dev->l2ad_vdev) {
+ remdev = dev;
+ break;
+ }
+ }
+ ASSERT(remdev != NULL);
+
+ /*
+ * Remove device from global list
+ */
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(remdev->l2ad_buflist);
+ kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ kmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ /*
+ * This is called from dmu_fini(), which is called from spa_fini();
+ * Because of this, we can assume that all l2arc devices have
+ * already been removed when the pools themselves were removed.
+ */
+
+ l2arc_do_free_on_write();
+
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_buflist_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode & FWRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode & FWRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
index 4442b1f28ac8..93b7741d77be 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/bplist.h>
#include <sys/zfs_context.h>
@@ -47,7 +45,7 @@ bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
{
int size;
- size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
+ size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
@@ -181,7 +179,7 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
}
int
-bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
{
uint64_t blk, off;
blkptr_t *bparray;
@@ -229,7 +227,7 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
* Deferred entry; will be written later by bplist_sync().
*/
void
-bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
{
bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
@@ -278,9 +276,7 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
int
bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
- uint64_t itor = 0, comp = 0, uncomp = 0;
int err;
- blkptr_t bp;
mutex_enter(&bpl->bpl_lock);
@@ -298,6 +294,9 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
mutex_exit(&bpl->bpl_lock);
if (!bpl->bpl_havecomp) {
+ uint64_t itor = 0, comp = 0, uncomp = 0;
+ blkptr_t bp;
+
while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
comp += BP_GET_PSIZE(&bp);
uncomp += BP_GET_UCSIZE(&bp);
@@ -310,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
return (err);
}
+
+/*
+ * Return (in *dasizep) the amount of space on the deadlist which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *dasizep)
+{
+ uint64_t size = 0;
+ uint64_t itor = 0;
+ blkptr_t bp;
+ int err;
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpl_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
+ mutex_enter(&bpl->bpl_lock);
+ err = bplist_hold(bpl);
+ if (err == 0)
+ *dasizep = bpl->bpl_phys->bpl_bytes;
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
+ if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
+ size +=
+ bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
+ }
+ }
+ if (err == ENOENT)
+ err = 0;
+ *dasizep = size;
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 94c63081478a..2494c1e7f9d1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@@ -39,17 +37,10 @@
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;
-int zfs_mdcomp_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
/*
* Global data structures and functions for the dbuf cache.
*/
@@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- blkptr_t *bp;
+ dnode_t *dn = db->db_dnode;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
+ arc_buf_t *pbuf;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ int bonuslen = dn->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
+ arc_space_consume(DN_MAX_BONUSLEN);
+ if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+ bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
}
- if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
- bp = NULL;
- else
- bp = db->db_blkptr;
-
- if (bp == NULL)
- dprintf_dbuf(db, "blkptr: %s\n", "NULL");
- else
- dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
- if (bp == NULL || BP_IS_HOLE(bp)) {
+ /*
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- ASSERT(bp == NULL || BP_IS_HOLE(bp));
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
@@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_L2CACHE;
+
zb.zb_objset = db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : 0;
zb.zb_object = db->db.db_object;
@@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
- ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
- (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
- db->db_level > 0 ? byteswap_uint64_array :
- dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+
+ if (db->db_parent)
+ pbuf = db->db_parent->db_buf;
+ else
+ pbuf = db->db_objset->os_phys_buf;
+
+ (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
@@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DB_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
@@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/* free this block */
if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
/* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ (void) dsl_free(NULL,
+ spa_get_dsl(db->db_dnode->dn_objset->os_spa),
txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
arc_release(dr->dt.dl.dr_data, db);
}
+/*
+ * Evict (if its unreferenced) or clear (if its referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks. Also, if we happen accross any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t first_l1 = start >> epbs;
+ uint64_t last_l1 = end >> epbs;
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ if (end > dn->dn_maxblkid) {
+ end = dn->dn_maxblkid;
+ last_l1 = end >> epbs;
+ }
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ if (db->db_level == 1 &&
+ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_last_dirty &&
+ db->db_last_dirty->dr_txg < txg) {
+ dbuf_add_ref(db, FTAG);
+ mutex_exit(&db->db_mtx);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
+ if (db->db_blkid < start || db->db_blkid > end)
continue;
/* found a level 0 buffer in the range */
@@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- /* If we have been dirtied since the last snapshot, its not new */
if (db->db_last_dirty)
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
+ /* If we don't exist or are in a snapshot, we can't be freed */
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
else
- return (TRUE);
+ return (FALSE);
}
void
@@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
+ boolean_t do_free_accounting = B_FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(tx->tx_txg != 0);
@@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drp = &db->db_last_dirty;
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
db->db.db_object == DMU_META_DNODE_OBJECT);
- while (*drp && (*drp)->dr_txg > tx->tx_txg)
- drp = &(*drp)->dr_next;
- if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
- dbuf_unoverride(*drp);
+ dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
- return (*drp);
+ return (dr);
}
/*
@@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ * Note: we delay "free accounting" until after we drop
+ * the db_mtx. This keeps us from grabbing other locks
+ * (and possibly deadlocking) in bp_get_dasize() while
+ * also holding the db_mtx.
+ */
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ do_free_accounting = dbuf_block_freeable(db);
+ }
+
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
@@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_freed_in_flight = FALSE;
}
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
/*
* This buffer is now part of this txg
*/
@@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
return (dr);
- }
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ } else if (do_free_accounting) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
}
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
@@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drop_struct_lock = TRUE;
}
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
if (db->db_level+1 < dn->dn_nlevels) {
dmu_buf_impl_t *parent = db->db_parent;
dbuf_dirty_record_t *di;
@@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
+ dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* If this buffer is not dirty, we're done.
*/
- for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
if (dr->dr_txg <= txg)
break;
if (dr == NULL || dr->dr_txg < txg) {
@@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/* XXX would be nice to fix up dn_towrite_space[] */
- db->db_last_dirty = dr->dr_next;
+ *drp = dr->dr_next;
if (dr->dr_parent) {
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
@@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- int rf = DB_RF_MUST_SUCCEED;
+ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID)
+ if (db->db_blkid == DB_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
}
@@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
if (db->db_buf)
@@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t));
return (db);
} else {
int blocksize =
@@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
list_insert_head(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t));
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
@@ -1469,31 +1508,33 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
-
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (list_link_active(&db->db_link)) {
+ if (db->db_dnode) {
+ dnode_t *dn = db->db_dnode;
+
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
- db->db_dnode = NULL;
db->db_buf = NULL;
+ ASSERT(!list_link_active(&db->db_link));
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t));
}
void
@@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ arc_buf_t *pbuf;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
@@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
zb.zb_level = 0;
zb.zb_blkid = blkid;
- (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
- dmu_ot[dn->dn_type].ot_byteswap,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ if (db)
+ pbuf = db->db_buf;
+ else
+ pbuf = dn->dn_objset->os_phys_buf;
+
+ (void) arc_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
return (err ? NULL : db);
}
-dmu_buf_impl_t *
+void
dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
+ dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_evict(db);
} else {
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- mutex_exit(&db->db_mtx);
+ if (!DBUF_IS_CACHEABLE(db))
+ dbuf_clear(db);
+ else
+ mutex_exit(&db->db_mtx);
}
} else {
mutex_exit(&db->db_mtx);
@@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
db->db_data_pending = dr;
- arc_release(db->db_buf, db);
mutex_exit(&db->db_mtx);
-
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
- zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+ dbuf_write(dr, db->db_buf, tx);
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
@@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- int checksum, compress;
int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
if (db->db_blkid == DB_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
+
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
- if (*datap != db->db.db_data)
+ if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
while (*drp != dr)
drp = &(*drp)->dr_next;
- ASSERT((*drp)->dr_next == NULL);
- *drp = NULL;
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
/*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
* If this buffer is in the middle of an immdiate write,
* wait for the synchronous IO to complete.
*/
@@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- dbuf_check_blkptr(dn, db);
-
/*
* If this dbuf has already been written out via an immediate write,
* just complete the write by copying over the new block pointer and
@@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
zio_fake.io_bp = db->db_blkptr;
zio_fake.io_bp_orig = *db->db_blkptr;
zio_fake.io_txg = txg;
+ zio_fake.io_flags = 0;
*db->db_blkptr = dr->dt.dl.dr_overridden_by;
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dr->dr_zio = &zio_fake;
mutex_exit(&db->db_mtx);
+ ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
+ BP_IDENTITY(&zio_fake.io_bp_orig)) ||
+ BP_IS_HOLE(zio_fake.io_bp));
+
if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
&zio_fake.io_bp_orig, dn->dn_zio, tx);
dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
- } else {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
}
ASSERT(*datap != NULL);
@@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
-
- dbuf_write(dr, *datap, checksum, compress, tx);
+ dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT)
@@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
}
static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx)
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn = db->db_dnode;
@@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
+
+ if (!BP_IS_HOLE(db->db_blkptr) &&
+ (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(data, db);
+ } else {
+ ASSERT(arc_released(data));
+ /* XXX why do we need to thaw here? */
+ arc_buf_thaw(data);
+ }
if (parent != dn->dn_dbuf) {
ASSERT(parent && parent->db_data_pending);
@@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
+ wp.wp_type = dn->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dncompress = dn->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+ wp.wp_dnchecksum = dn->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+
if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
+ (void) dsl_dataset_block_kill(
os->os_dsl_dataset, db->db_blkptr, zio, tx);
- dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os, &zb, dn->dn_type), txg,
- db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+ dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+ DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
+ data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
/* ARGSUSED */
@@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_impl_t *db = vdb;
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t fill = 0;
int old_size, new_size, i;
+ ASSERT(db->db_blkptr == bp);
+
dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+ new_size = bp_get_dasize(os->os_spa, bp);
- dnode_diduse_space(dn, new_size-old_size);
+ dnode_diduse_space(dn, new_size - old_size);
- if (BP_IS_HOLE(zio->io_bp)) {
+ if (BP_IS_HOLE(bp)) {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ ASSERT3U(bp->blk_fill, ==, 0);
return;
}
+ ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+
mutex_enter(&db->db_mtx);
if (db->db_level == 0) {
@@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill = 1;
}
} else {
- blkptr_t *bp = db->db.db_data;
+ blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
- if (BP_IS_HOLE(bp))
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
continue;
- ASSERT3U(BP_GET_LSIZE(bp), ==,
+ ASSERT3U(BP_GET_LSIZE(ibp), ==,
db->db_level == 1 ? dn->dn_datablksz :
(1<<dn->dn_phys->dn_indblkshift));
- fill += bp->blk_fill;
+ fill += ibp->blk_fill;
}
}
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ } else {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- dsl_dataset_block_born(ds, zio->io_bp, tx);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
@@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_enter(&db->db_mtx);
drp = &db->db_last_dirty;
- while (*drp != db->db_data_pending)
- drp = &(*drp)->dr_next;
- ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
- ASSERT((*drp)->dr_txg == txg);
- ASSERT((*drp)->dr_next == NULL);
- dr = *drp;
- *drp = NULL;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (db->db_level == 0) {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index d3be6b4ff22e..377efb9d105e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
@@ -42,6 +40,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "DSL props" },
{ byteswap_uint64_array, TRUE, "DSL dataset" },
{ zfs_znode_byteswap, TRUE, "ZFS znode" },
- { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
{ byteswap_uint8_array, FALSE, "ZFS plain file" },
{ zap_byteswap, TRUE, "ZFS directory" },
{ zap_byteswap, TRUE, "ZFS master node" },
@@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "persistent error log" },
{ byteswap_uint8_array, TRUE, "SPA history" },
{ byteswap_uint64_array, TRUE, "SPA history offsets" },
- { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "DSL permissions" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
+ { byteswap_uint8_array, TRUE, "FUID table" },
+ { byteswap_uint64_array, TRUE, "FUID table size" },
+ { zap_byteswap, TRUE, "DSL dataset next clones"},
+ { zap_byteswap, TRUE, "scrub work queue" },
};
int
@@ -115,6 +121,19 @@ dmu_bonus_max(void)
return (DN_MAX_BONUSLEN);
}
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+ if (newsize < 0 || newsize > db->db_size)
+ return (EINVAL);
+ dnode_setbonuslen(dn, newsize, tx);
+ return (0);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -122,27 +141,27 @@ int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
- int err, count;
dmu_buf_impl_t *db;
+ int error;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
+ dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -161,11 +180,13 @@ static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
+ dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t flags;
int err;
zio_t *zio;
+ hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+ if (dn->dn_objset->os_dsl_dataset)
+ dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
+ if (dp && dsl_pool_sync_context(dp))
+ start = gethrtime();
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
@@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
/* wait for async i/o */
err = zio_wait(zio);
+ /* track read overhead when we are in sync context */
+ if (dp && dsl_pool_sync_context(dp))
+ dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
dnode_rele(dn, FTAG);
}
+static int
+get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+{
+ uint64_t len = *offset - limit;
+ uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
+ uint64_t subchunk =
+ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT(limit <= *offset);
+
+ if (len <= chunk_len) {
+ *offset = limit;
+ return (0);
+ }
+
+ ASSERT(ISP2(subchunk));
+
+ while (*offset > limit) {
+ uint64_t initial_offset = P2ROUNDUP(*offset, subchunk);
+ uint64_t delta;
+ int err;
+
+ /* skip over allocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ ASSERT3U(*offset, <=, initial_offset);
+ *offset = P2ALIGN(*offset, subchunk);
+ delta = initial_offset - *offset;
+ if (delta >= chunk_len) {
+ *offset += delta - chunk_len;
+ return (0);
+ }
+ chunk_len -= delta;
+
+ /* skip over unallocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ if (*offset < limit)
+ *offset = limit;
+ ASSERT3U(*offset, <, initial_offset);
+ }
+ return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length, boolean_t free_dnode)
+{
+ dmu_tx_t *tx;
+ uint64_t object_size, start, end, len;
+ boolean_t trunc = (length == DMU_OBJECT_END);
+ int align, err;
+
+ align = 1 << dn->dn_datablkshift;
+ ASSERT(align > 0);
+ object_size = align == 1 ? dn->dn_datablksz :
+ (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+
+ if (trunc || (end = offset + length) > object_size)
+ end = object_size;
+ if (end <= offset)
+ return (0);
+ length = end - offset;
+
+ while (length) {
+ start = end;
+ err = get_next_chunk(dn, &start, offset);
+ if (err)
+ return (err);
+ len = trunc ? DMU_OBJECT_END : end - start;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dnode_free_range(dn, start, trunc ? -1 : len, tx);
+
+ if (start == 0 && free_dnode) {
+ ASSERT(trunc);
+ dnode_free(dn, tx);
+ }
+
+ length -= end - start;
+
+ dmu_tx_commit(tx);
+ end = start;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_t *tx;
+ int err;
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err != 0)
+ return (err);
+ if (dn->dn_nlevels == 1) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ } else {
+ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
@@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp);
if (err)
- return (err);
+ break;
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
void
@@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ va = zfs_map_page(pp, S_READ);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- ppmapout(va);
+ zfs_unmap_page(pp, va);
pp = pp->p_next;
bufoff += PAGESIZE;
}
@@ -620,6 +796,22 @@ typedef struct {
/* ARGSUSED */
static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (!BP_IS_HOLE(bp)) {
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
+}
+
+/* ARGSUSED */
+static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *in = varg;
@@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dmu_buf_impl_t *db = dr->dr_dbuf;
dmu_sync_cb_t *done = in->done;
- if (!BP_IS_HOLE(zio->io_bp)) {
- zio->io_bp->blk_fill = 1;
- BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
- BP_SET_LEVEL(zio->io_bp, 0);
- }
-
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
@@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *in;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
int err;
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
-
dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
@@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- zio = arc_write(pio, os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
- dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
- txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ wp.wp_type = db->db_dnode->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dnchecksum = db->db_dnode->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+ wp.wp_dncompress = db->db_dnode->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+
+ ASSERT(BP_IS_HOLE(bp));
+
+ zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
+ txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
if (pio) {
zio_nowait(zio);
@@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
}
int
-dmu_get_replication_level(objset_impl_t *os,
- zbookmark_t *zb, dmu_object_type_t ot)
-{
- int ncopies = os->os_copies;
-
- /* If it's the mos, it should have max copies set. */
- ASSERT(zb->zb_objset != 0 ||
- ncopies == spa_max_replication(os->os_spa));
-
- if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
- ncopies++;
- return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
-
-int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
@@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
return (err);
}
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
dnode_rele(dn, FTAG);
return (err);
@@ -1018,6 +1193,7 @@ dmu_init(void)
dbuf_init();
dnode_init();
arc_init();
+ l2arc_init();
}
void
@@ -1026,4 +1202,5 @@ dmu_fini(void)
arc_fini();
dnode_fini();
dbuf_fini();
+ l2arc_fini();
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 93168cc8901f..1b9247d66e65 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,7 +54,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
int error = dnode_next_offset(osi->os_meta_dnode,
- B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ DNODE_FIND_HOLE,
+ &offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
if (error == 0)
object = offset >> DNODE_SHIFT;
@@ -139,6 +140,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
@@ -152,7 +154,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
int error;
error = dnode_next_offset(os->os->os_meta_dnode,
- hole, &offset, 0, DNODES_PER_BLOCK, txg);
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 378fe8c15bc0..7981e06825c4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -19,12 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
@@ -32,6 +31,7 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
@@ -40,7 +40,7 @@
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
-
+#include <sys/zfs_ioctl.h>
spa_t *
dmu_objset_spa(objset_t *os)
@@ -131,6 +131,34 @@ copies_changed_cb(void *arg, uint64_t newval)
osi->os_copies = newval;
}
+static void
+primary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ osi->os_primary_cache = newval;
+}
+
+static void
+secondary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ osi->os_secondary_cache = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -146,8 +174,10 @@ int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
objset_impl_t **osip)
{
- objset_impl_t *winner, *osi;
- int i, err, checksum;
+ objset_impl_t *osi;
+ int i, err;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
osi->os.os = osi;
@@ -161,18 +191,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zb.zb_object = 0;
zb.zb_level = -1;
zb.zb_blkid = 0;
+ if (DMU_OS_IS_L2CACHEABLE(osi))
+ aflags |= ARC_L2CACHE;
dprintf_bp(osi->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, osi->os_rootbp,
- dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ /*
+ * NB: when bprewrite scrub can change the bp,
+ * and this is called from dmu_objset_open_ds_os, the bp
+ * could change, and we'll need a lock.
+ */
+ err = arc_read_nolock(NULL, spa, osi->os_rootbp,
arc_getbuf_func, &osi->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
kmem_free(osi, sizeof (objset_impl_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
return (err);
}
osi->os_phys = osi->os_phys_buf->b_data;
- arc_release(osi->os_phys_buf, &osi->os_phys_buf);
} else {
osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
&osi->os_phys_buf, ARC_BUFC_METADATA);
@@ -183,18 +221,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/*
* Note: the changed_cb will be called once before the register
* func returns, thus changing the checksum/compression from the
- * default (fletcher2/off). Snapshots don't need to know, and
- * registering would complicate clone promotion.
+ * default (fletcher2/off). Snapshots don't need to know about
+ * checksum/compression/copies.
*/
- if (ds && ds->ds_phys->ds_num_children == 0) {
- err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
+ if (ds) {
+ err = dsl_prop_register(ds, "primarycache",
+ primary_cache_changed_cb, osi);
if (err == 0)
- err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
+ err = dsl_prop_register(ds, "secondarycache",
+ secondary_cache_changed_cb, osi);
+ if (!dsl_dataset_is_snapshot(ds)) {
+ if (err == 0)
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "copies",
+ copies_changed_cb, osi);
+ }
if (err) {
VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
&osi->os_phys_buf) == 1);
@@ -206,24 +252,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
osi->os_compress = ZIO_COMPRESS_LZJB;
osi->os_copies = spa_max_replication(spa);
+ osi->os_primary_cache = ZFS_CACHE_ALL;
+ osi->os_secondary_cache = ZFS_CACHE_ALL;
}
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
-
- /*
- * Metadata always gets compressed and checksummed.
- * If the data checksum is multi-bit correctable, and it's not
- * a ZBT-style checksum, then it's suitable for metadata as well.
- * Otherwise, the metadata checksum defaults to fletcher4.
- */
- checksum = osi->os_checksum;
-
- if (zio_checksum_table[checksum].ci_correctable &&
- !zio_checksum_table[checksum].ci_zbt)
- osi->os_md_checksum = checksum;
- else
- osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_md_compress = ZIO_COMPRESS_LZJB;
+ osi->os_zil_header = osi->os_phys->os_zil_header;
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
@@ -238,70 +272,118 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
osi->os_meta_dnode = dnode_special_open(osi,
&osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
- if (ds != NULL) {
- winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
- if (winner) {
- dmu_objset_evict(ds, osi);
- osi = winner;
- }
+ /*
+ * We should be the only thread trying to do this because we
+ * have ds_opening_lock
+ */
+ if (ds) {
+ VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
+ dmu_objset_evict));
}
*osip = osi;
return (0);
}
-/* called from zpl */
-int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
+static int
+dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
{
- dsl_dataset_t *ds;
- int err;
- objset_t *os;
objset_impl_t *osi;
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dsl_dataset_open(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
-
+ mutex_enter(&ds->ds_opening_lock);
osi = dsl_dataset_get_user_ptr(ds);
if (osi == NULL) {
+ int err;
+
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
ds, &ds->ds_phys->ds_bp, &osi);
if (err) {
- dsl_dataset_close(ds, mode, os);
- kmem_free(os, sizeof (objset_t));
+ mutex_exit(&ds->ds_opening_lock);
return (err);
}
}
+ mutex_exit(&ds->ds_opening_lock);
os->os = osi;
- os->os_mode = mode;
+ os->os_mode = DS_MODE_NOHOLD;
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
- dmu_objset_close(os);
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
return (EINVAL);
- }
- *osp = os;
return (0);
}
+int
+dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+{
+ objset_t *os;
+ int err;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dmu_objset_open_ds_os(ds, os, type);
+ if (err)
+ kmem_free(os, sizeof (objset_t));
+ else
+ *osp = os;
+ return (err);
+}
+
+/* called from zpl */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ objset_t *os;
+ dsl_dataset_t *ds;
+ int err;
+
+ ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER ||
+ DS_MODE_TYPE(mode) == DS_MODE_OWNER);
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ if (DS_MODE_TYPE(mode) == DS_MODE_USER)
+ err = dsl_dataset_hold(name, os, &ds);
+ else
+ err = dsl_dataset_own(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ err = dmu_objset_open_ds_os(ds, os, type);
+ if (err) {
+ if (DS_MODE_TYPE(mode) == DS_MODE_USER)
+ dsl_dataset_rele(ds, os);
+ else
+ dsl_dataset_disown(ds, os);
+ kmem_free(os, sizeof (objset_t));
+ } else {
+ os->os_mode = mode;
+ *osp = os;
+ }
+ return (err);
+}
+
void
dmu_objset_close(objset_t *os)
{
- dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER ||
+ DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER ||
+ DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD);
+
+ if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER)
+ dsl_dataset_rele(os->os->os_dsl_dataset, os);
+ else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER)
+ dsl_dataset_disown(os->os->os_dsl_dataset, os);
kmem_free(os, sizeof (objset_t));
}
int
-dmu_objset_evict_dbufs(objset_t *os, int try)
+dmu_objset_evict_dbufs(objset_t *os)
{
objset_impl_t *osi = os->os;
dnode_t *dn;
@@ -319,34 +401,25 @@ dmu_objset_evict_dbufs(objset_t *os, int try)
* skip.
*/
for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
+ dn && !dnode_add_ref(dn, FTAG);
dn = list_next(&osi->os_dnodes, dn))
continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
while (dn) {
dnode_t *next_dn = dn;
do {
next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
+ } while (next_dn && !dnode_add_ref(next_dn, FTAG));
mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
+ dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&osi->os_lock);
dn = next_dn;
}
mutex_exit(&osi->os_lock);
- return (0);
+ return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
}
void
@@ -361,13 +434,19 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
}
- if (ds && ds->ds_phys->ds_num_children == 0) {
- VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
+ if (ds) {
+ if (!dsl_dataset_is_snapshot(ds)) {
+ VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "copies",
+ copies_changed_cb, osi));
+ }
+ VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
+ primary_cache_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
+ secondary_cache_changed_cb, osi));
}
/*
@@ -375,7 +454,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
* nothing can be added to the list at this point.
*/
os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
+ (void) dmu_objset_evict_dbufs(&os);
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
@@ -387,6 +466,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
mutex_destroy(&osi->os_lock);
mutex_destroy(&osi->os_obj_lock);
+ mutex_destroy(&osi->os_user_ptr_lock);
kmem_free(osi, sizeof (objset_impl_t));
}
@@ -399,7 +479,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
+ if (ds)
+ mutex_enter(&ds->ds_opening_lock);
VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+ if (ds)
+ mutex_exit(&ds->ds_opening_lock);
mdn = osi->os_meta_dnode;
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -443,14 +527,15 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
}
struct oscarg {
- void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
void *userarg;
dsl_dataset_t *clone_parent;
const char *lastname;
dmu_objset_type_t type;
+ uint64_t flags;
};
-/* ARGSUSED */
+/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -478,11 +563,12 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (oa->clone_parent->ds_phys->ds_num_children == 0)
return (EINVAL);
}
+
return (0);
}
static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct oscarg *oa = arg2;
@@ -493,10 +579,9 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_parent, tx);
+ oa->clone_parent, oa->flags, cr, tx);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
+ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
bp = dsl_dataset_get_blkptr(ds);
if (BP_IS_HOLE(bp)) {
objset_impl_t *osi;
@@ -506,15 +591,19 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ds, bp, oa->type, tx);
if (oa->userfunc)
- oa->userfunc(&osi->os, oa->userarg, tx);
+ oa->userfunc(&osi->os, oa->userarg, cr, tx);
}
- dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+
+ spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", dsobj);
+
+ dsl_dataset_rele(ds, FTAG);
}
int
dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+ objset_t *clone_parent, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
dsl_dir_t *pdd;
const char *tail;
@@ -536,6 +625,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
oa.userarg = arg;
oa.lastname = tail;
oa.type = type;
+ oa.flags = flags;
+
if (clone_parent != NULL) {
/*
* You can't clone to a different type.
@@ -564,33 +655,47 @@ dmu_objset_destroy(const char *name)
* It would be nicer to do this in dsl_dataset_destroy_sync(),
* but the replay log objset is modified in open context.
*/
- error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ error = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
if (error == 0) {
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
zil_destroy(dmu_objset_zil(os), B_FALSE);
- dmu_objset_close(os);
+
+ error = dsl_dataset_destroy(ds, os);
+ /*
+ * dsl_dataset_destroy() closes the ds.
+ */
+ kmem_free(os, sizeof (objset_t));
}
- return (dsl_dataset_destroy(name));
+ return (error);
}
+/*
+ * This will close the objset.
+ */
int
-dmu_objset_rollback(const char *name)
+dmu_objset_rollback(objset_t *os)
{
int err;
- objset_t *os;
+ dsl_dataset_t *ds;
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err == 0) {
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0)
- zil_resume(dmu_objset_zil(os));
- if (err == 0) {
- /* XXX uncache everything? */
- err = dsl_dataset_rollback(os->os->os_dsl_dataset);
- }
+ ds = os->os->os_dsl_dataset;
+
+ if (!dsl_dataset_tryown(ds, TRUE, os)) {
dmu_objset_close(os);
+ return (EBUSY);
}
+
+ err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+
+ /*
+ * NB: we close the objset manually because the rollback
+ * actually implicitly called dmu_objset_evict(), thus freeing
+ * the objset_impl_t.
+ */
+ dsl_dataset_disown(ds, os);
+ kmem_free(os, sizeof (objset_t));
return (err);
}
@@ -598,6 +703,13 @@ struct snaparg {
dsl_sync_task_group_t *dstg;
char *snapname;
char failed[MAXPATHLEN];
+ boolean_t checkperms;
+ list_t objsets;
+};
+
+struct osnode {
+ list_node_t node;
+ objset_t *os;
};
static int
@@ -605,20 +717,25 @@ dmu_objset_snapshot_one(char *name, void *arg)
{
struct snaparg *sn = arg;
objset_t *os;
- dmu_objset_stats_t stat;
int err;
(void) strcpy(sn->failed, name);
- err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ /*
+ * Check permissions only when requested. This only applies when
+ * doing a recursive snapshot. The permission checks for the starting
+ * dataset have already been performed in zfs_secpolicy_snapshot()
+ */
+ if (sn->checkperms == B_TRUE &&
+ (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+ return (err);
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
if (err != 0)
return (err);
- /*
- * If the objset is in an inconsistent state, return busy.
- */
- dmu_objset_fast_stat(os, &stat);
- if (stat.dds_inconsistent) {
+ /* If the objset is in an inconsistent state, return busy */
+ if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
dmu_objset_close(os);
return (EBUSY);
}
@@ -630,8 +747,13 @@ dmu_objset_snapshot_one(char *name, void *arg)
*/
err = zil_suspend(dmu_objset_zil(os));
if (err == 0) {
+ struct osnode *osn;
dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, os, sn->snapname, 3);
+ dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
+ sn->snapname, 3);
+ osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
+ osn->os = os;
+ list_insert_tail(&sn->objsets, osn);
} else {
dmu_objset_close(os);
}
@@ -643,31 +765,28 @@ int
dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
{
dsl_sync_task_t *dst;
+ struct osnode *osn;
struct snaparg sn = { 0 };
- char *cp;
spa_t *spa;
int err;
(void) strcpy(sn.failed, fsname);
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
+ err = spa_open(fsname, &spa, FTAG);
if (err)
return (err);
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
+ list_create(&sn.objsets, sizeof (struct osnode),
+ offsetof(struct osnode, node));
if (recursive) {
+ sn.checkperms = B_TRUE;
err = dmu_objset_find(fsname,
dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
} else {
+ sn.checkperms = B_FALSE;
err = dmu_objset_snapshot_one(fsname, &sn);
}
@@ -678,13 +797,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
for (dst = list_head(&sn.dstg->dstg_tasks); dst;
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
- objset_t *os = dst->dst_arg1;
+ dsl_dataset_t *ds = dst->dst_arg1;
if (dst->dst_err)
- dmu_objset_name(os, sn.failed);
- zil_resume(dmu_objset_zil(os));
- dmu_objset_close(os);
+ dsl_dataset_name(ds, sn.failed);
}
+
out:
+ while (osn = list_head(&sn.objsets)) {
+ list_remove(&sn.objsets, osn);
+ zil_resume(dmu_objset_zil(osn->os));
+ dmu_objset_close(osn->os);
+ kmem_free(osn, sizeof (struct osnode));
+ }
+ list_destroy(&sn.objsets);
+
if (err)
(void) strcpy(fsname, sn.failed);
dsl_sync_task_group_destroy(sn.dstg);
@@ -717,39 +843,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
static void
ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
objset_impl_t *os = arg;
- blkptr_t *bp = os->os_rootbp;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
- int i;
+
+ ASSERT(bp == os->os_rootbp);
+ ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
/*
* Update rootbp fill count.
*/
bp->blk_fill = 1; /* count the meta-dnode */
- for (i = 0; i < dnp->dn_nblkptr; i++)
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
-}
-/* ARGSUSED */
-static void
-killer(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- objset_impl_t *os = arg;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
- BP_SET_LEVEL(zio->io_bp, 0);
-
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
- BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ } else {
if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, NULL, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
- os->os_synctx);
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, zio, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
}
- arc_release(os->os_phys_buf, &os->os_phys_buf);
}
/* called from dsl */
@@ -758,10 +875,10 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
list_t *list;
dbuf_dirty_record_t *dr;
- int zio_flags;
dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -783,19 +900,24 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
*/
zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
zb.zb_object = 0;
- zb.zb_level = -1;
+ zb.zb_level = -1; /* for block ordering; it's level 0 on disk */
zb.zb_blkid = 0;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+
+ wp.wp_type = DMU_OT_OBJSET;
+ wp.wp_level = 0; /* on-disk BP level; see above */
+ wp.wp_copies = os->os_copies;
+ wp.wp_oschecksum = os->os_checksum;
+ wp.wp_oscompress = os->os_compress;
+
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
os->os_rootbp, pio, tx);
- zio = arc_write(pio, os->os_spa, os->os_md_checksum,
- os->os_md_compress,
- dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+ }
+
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+ zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync meta-dnode - the parent IO for the sync is the root block
@@ -819,6 +941,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
* Free intent log blocks up to this tx.
*/
zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
zio_nowait(zio);
}
@@ -867,8 +990,23 @@ dmu_objset_is_snapshot(objset_t *os)
}
int
+dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
+ boolean_t *conflict)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ uint64_t ignored;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
+ real, maxlen, conflict));
+}
+
+int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp)
+ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
dsl_dataset_t *ds = os->os->os_dsl_dataset;
zap_cursor_t cursor;
@@ -894,6 +1032,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
(void) strcpy(name, attr.za_name);
if (idp)
*idp = attr.za_first_integer;
+ if (case_conflict)
+ *case_conflict = attr.za_normalization_conflict;
zap_cursor_advance(&cursor);
*offp = zap_cursor_serialize(&cursor);
zap_cursor_fini(&cursor);
@@ -938,48 +1078,80 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
return (0);
}
+struct findarg {
+ int (*func)(char *, void *);
+ void *arg;
+};
+
+/* ARGSUSED */
+static int
+findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct findarg *fa = arg;
+ return (fa->func((char *)dsname, fa->arg));
+}
+
/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * Perhaps change all callers to use dmu_objset_find_spa()?
*/
int
dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
{
+ struct findarg fa;
+ fa.func = func;
+ fa.arg = arg;
+ return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
+}
+
+/*
+ * Find all objsets under name, call func on each
+ */
+int
+dmu_objset_find_spa(spa_t *spa, const char *name,
+ int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+{
dsl_dir_t *dd;
- objset_t *os;
- uint64_t snapobj;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
char *child;
- int do_self, err;
+ uint64_t thisobj;
+ int err;
- err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (name == NULL)
+ name = spa_name(spa);
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
if (err)
return (err);
- /* NB: the $MOS dir doesn't have a head dataset */
- do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_close(dd, FTAG);
+ return (0);
+ }
+
+ thisobj = dd->dd_phys->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ dp = dd->dd_pool;
/*
* Iterate over all children.
*/
if (flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
dd->dd_phys->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT(attr->za_integer_length == sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- /*
- * No separating '/' because parent's name ends in /.
- */
child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
+ (void) strcpy(child, name);
(void) strcat(child, "/");
(void) strcat(child, attr->za_name);
- err = dmu_objset_find(child, func, arg, flags);
+ err = dmu_objset_find_spa(spa, child, func, arg, flags);
kmem_free(child, MAXPATHLEN);
if (err)
break;
@@ -996,30 +1168,36 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
/*
* Iterate over all snapshots.
*/
- if ((flags & DS_FIND_SNAPSHOTS) &&
- dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
-
- snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
- dmu_objset_close(os);
-
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
+ if (flags & DS_FIND_SNAPSHOTS) {
+ if (!dsl_pool_sync_context(dp))
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+ if (!dsl_pool_sync_context(dp))
+ rw_exit(&dp->dp_config_rwlock);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
- err = func(child, arg);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
+ if (err == 0) {
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr->za_integer_length ==
+ sizeof (uint64_t));
+ ASSERT(attr->za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ (void) strcpy(child, name);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr->za_name);
+ err = func(spa, attr->za_first_integer,
+ child, arg);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
}
- zap_cursor_fini(&zc);
}
dsl_dir_close(dd, FTAG);
@@ -1031,7 +1209,20 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
/*
* Apply to self if appropriate.
*/
- if (do_self)
- err = func(name, arg);
+ err = func(spa, thisobj, name, arg);
return (err);
}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ os->os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ return (os->os->os_user_ptr);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 3e55dc301620..1294581a7133 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,10 +41,13 @@
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+static char *dmu_recv_tag = "dmu_recv_tag";
+
struct backuparg {
dmu_replay_record_t *drr;
kthread_t *td;
struct file *fp;
+ offset_t *off;
objset_t *os;
zio_cksum_t zc;
int err;
@@ -77,6 +80,7 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
ba->err = EOPNOTSUPP;
#endif
+ *ba->off += len;
return (ba->err);
}
@@ -179,7 +183,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
void *data = bc->bc_data;
int err = 0;
- if (SIGPENDING(curthread))
+ if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
ASSERT(data || bp == NULL);
@@ -215,10 +219,9 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
zb.zb_object = object;
zb.zb_level = level;
zb.zb_blkid = blkid;
- (void) arc_read(NULL, spa, bp,
- dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
+ (void) arc_read_nolock(NULL, spa, bp,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
if (abuf) {
err = dump_data(ba, type, object, blkid * blksz,
@@ -236,13 +239,15 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
}
int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+ struct file *fp, offset_t *off)
{
dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
dmu_replay_record_t *drr;
struct backuparg ba;
int err;
+ uint64_t fromtxg = 0;
/* tosnap must be a snapshot */
if (ds->ds_phys->ds_next_snap_obj == 0)
@@ -250,26 +255,55 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
/* fromsnap must be an earlier snapshot from the same fs as tosnap */
if (fromds && (ds->ds_dir != fromds->ds_dir ||
- fromds->ds_phys->ds_creation_txg >=
- ds->ds_phys->ds_creation_txg))
+ fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
return (EXDEV);
+ if (fromorigin) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (fromsnap)
+ return (EINVAL);
+
+ if (dsl_dir_is_clone(ds->ds_dir)) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ return (err);
+ } else {
+ fromorigin = B_FALSE;
+ }
+ }
+
+
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+ drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ if (fromorigin)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+
if (fromds)
drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+ if (fromds)
+ fromtxg = fromds->ds_phys->ds_creation_txg;
+ if (fromorigin)
+ dsl_dataset_rele(fromds, FTAG);
+
ba.drr = drr;
ba.td = curthread;
ba.fp = fp;
ba.os = tosnap;
+ ba.off = off;
ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
@@ -277,8 +311,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
return (ba.err);
}
- err = traverse_dsl_dataset(ds,
- fromds ? fromds->ds_phys->ds_creation_txg : 0,
+ err = traverse_dsl_dataset(ds, fromtxg,
ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
backup_cb, &ba);
@@ -303,164 +336,384 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
return (0);
}
-struct restorearg {
- int err;
- int byteswap;
- kthread_t *td;
- struct file *fp;
- char *buf;
- uint64_t voff;
- int buflen; /* number of valid bytes in buf */
- int bufoff; /* next offset to read */
- int bufsize; /* amount of memory allocated for buf */
- zio_cksum_t zc;
+struct recvbeginsyncarg {
+ const char *tofs;
+ const char *tosnap;
+ dsl_dataset_t *origin;
+ uint64_t fromguid;
+ dmu_objset_type_t type;
+ void *tag;
+ boolean_t force;
+ uint64_t dsflags;
+ char clonelastname[MAXNAMELEN];
+ dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};
+static dsl_dataset_t *
+recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
+ cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+
+ /* This should always work, since we just created it */
+ /* XXX - create should return an owned ds */
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
+ DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
+
+ if (type != DMU_OST_NONE) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ ds, &ds->ds_phys->ds_bp, type, tx);
+ }
+
+ spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
+ dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+
+ return (ds);
+}
+
/* ARGSUSED */
static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
- struct drr_begin *drrb = arg2;
- const char *snapname;
- int err;
+ dsl_dir_t *dd = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t val;
+ int err;
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
- return (ENODEV);
- /* must not have any changes since most recent snapshot */
- if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (ETXTBSY);
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
- /* new snapshot name must not exist */
- snapname = strrchr(drrb->drr_toname, '@');
- if (snapname == NULL)
- return (EEXIST);
+ if (rbsa->origin) {
+ /* make sure it's a snap in the same pool */
+ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
+ return (EXDEV);
+ if (rbsa->origin->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ }
- snapname++;
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
+ return (0);
+}
+
+static void
+recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+ uint64_t dsobj;
+
+ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
+ rbsa->origin, flags, cr, tx);
+
+ rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
+ rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+}
+
+static int
+recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ int err;
+
+ /* must be a head ds */
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ return (EINVAL);
+
+ /* must not be a clone ds */
+ if (dsl_dir_is_clone(ds->ds_dir))
+ return (EINVAL);
+
+ err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
+ if (err)
return (err);
+ if (rbsa->origin) {
+ /* make sure it's a snap in the same pool */
+ if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
+ return (EXDEV);
+ if (rbsa->origin->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ }
+
return (0);
}
-/* ARGSUSED */
static void
-replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ struct recvbeginsyncarg *rbsa = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+ uint64_t dsobj;
+
+ /*
+ * NB: caller must provide an extra hold on the dsl_dir_t, so it
+ * won't go away when dsl_dataset_destroy_sync() closes the
+ * dataset.
+ */
+ dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
+
+ dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
+
+ rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
+ rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
}
/* ARGSUSED */
static int
-replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- char *cp;
- uint64_t val;
+ dsl_dataset_t *ds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
int err;
+ uint64_t val;
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
- strrchr(drrb->drr_toname, '/') + 1,
- sizeof (uint64_t), 1, &val);
- *cp = '@';
+ /* must not have any changes since most recent snapshot */
+ if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
+ return (ETXTBSY);
+
+ /* must already be a snapshot of this fs */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ return (ENODEV);
+
+ /* most recent snapshot must match fromguid */
+ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ /* temporary clone name must not exist */
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_dir->dd_phys->dd_child_dir_zapobj,
+ rbsa->clonelastname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
if (err != ENOENT)
- return (err ? err : EEXIST);
+ return (err);
+ /* new snapshot name must not exist */
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
return (0);
}
+/* ARGSUSED */
static void
-replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- char *cp;
- dsl_dataset_t *ds;
+ dsl_dataset_t *ohds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ dsl_pool_t *dp = ohds->ds_dir->dd_pool;
+ dsl_dataset_t *ods, *cds;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
- NULL, tx);
- *cp = '@';
+ /* create the temporary clone */
+ VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
+ FTAG, &ods));
+ dsobj = dsl_dataset_create_sync(ohds->ds_dir,
+ rbsa->clonelastname, ods, flags, cr, tx);
+ dsl_dataset_rele(ods, FTAG);
+
+ /* open the temporary clone */
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
+ DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_EXCLUSIVE, FTAG, &ds));
+ /* copy the refquota from the target fs to the clone */
+ if (ohds->ds_quota > 0)
+ dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
- (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
+ rbsa->ds = cds;
+
+ spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+ dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+}
+
+/* ARGSUSED */
+static void
+recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
+ ds->ds_object);
}
-static int
-replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+ boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
+ int err = 0;
+ boolean_t byteswap;
+ struct recvbeginsyncarg rbsa;
+ uint64_t version;
+ int flags;
+ dsl_dataset_t *ds;
- /* XXX verify that drr_toname is in dd */
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC)
+ byteswap = FALSE;
+ else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+ byteswap = TRUE;
+ else
+ return (EINVAL);
- snapname = strchr(drrb->drr_toname, '@');
- if (snapname == NULL)
+ rbsa.tofs = tofs;
+ rbsa.tosnap = tosnap;
+ rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
+ rbsa.fromguid = drrb->drr_fromguid;
+ rbsa.type = drrb->drr_type;
+ rbsa.tag = FTAG;
+ rbsa.dsflags = 0;
+ version = drrb->drr_version;
+ flags = drrb->drr_flags;
+
+ if (byteswap) {
+ rbsa.type = BSWAP_32(rbsa.type);
+ rbsa.fromguid = BSWAP_64(rbsa.fromguid);
+ version = BSWAP_64(version);
+ flags = BSWAP_32(flags);
+ }
+
+ if (version != DMU_BACKUP_STREAM_VERSION ||
+ rbsa.type >= DMU_OST_NUMTYPES ||
+ ((flags & DRR_FLAG_CLONE) && origin == NULL))
return (EINVAL);
- snapname++;
- return (dsl_dataset_snapshot_check(os, snapname, tx));
-}
+ if (flags & DRR_FLAG_CI_DATA)
+ rbsa.dsflags = DS_FLAG_CI_DATASET;
-static void
-replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
- dsl_dataset_t *ds, *hds;
+ bzero(drc, sizeof (dmu_recv_cookie_t));
+ drc->drc_drrb = drrb;
+ drc->drc_tosnap = tosnap;
+ drc->drc_force = force;
- snapname = strchr(drrb->drr_toname, '@') + 1;
+ /*
+ * Process the begin in syncing context.
+ */
+ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
+ /* offline incremental receive */
+ err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
+ if (err)
+ return (err);
- dsl_dataset_snapshot_sync(os, snapname, tx);
+ /*
+ * Only do the rollback if the most recent snapshot
+ * matches the incremental source
+ */
+ if (force) {
+ if (ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_guid !=
+ rbsa.fromguid) {
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (ENODEV);
+ }
+ (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ }
+ rbsa.force = B_FALSE;
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_incremental_check,
+ recv_offline_incremental_sync, ds, &rbsa, 1);
+ if (err) {
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (err);
+ }
+ drc->drc_logical_ds = drc->drc_real_ds = ds;
+ } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
+ /* online incremental receive */
- /* set snapshot's creation time and guid */
- hds = os->os->os_dsl_dataset;
- VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
- hds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds));
+ /* tmp clone name is: tofs/%tosnap" */
+ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
+ "%%%s", tosnap);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
- ds->ds_phys->ds_guid = drrb->drr_toguid;
- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ /* open the dataset we are logically receiving into */
+ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+ if (err)
+ return (err);
- dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+ rbsa.force = force;
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_incremental_check,
+ recv_online_incremental_sync, ds, &rbsa, 5);
+ if (err) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (err);
+ }
+ drc->drc_logical_ds = ds;
+ drc->drc_real_ds = rbsa.ds;
+ } else {
+ /* create new fs -- full backup or clone */
+ dsl_dir_t *dd = NULL;
+ const char *tail;
+
+ err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ if (!force) {
+ dsl_dir_close(dd, FTAG);
+ return (EEXIST);
+ }
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_own_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ DS_MODE_INCONSISTENT, FTAG, &ds);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+
+ dsl_dataset_make_exclusive(ds, FTAG);
+ err = dsl_sync_task_do(dd->dd_pool,
+ recv_full_existing_check,
+ recv_full_existing_sync, ds, &rbsa, 5);
+ dsl_dataset_disown(ds, FTAG);
+ } else {
+ err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
+ recv_full_sync, dd, &rbsa, 5);
+ }
+ dsl_dir_close(dd, FTAG);
+ if (err)
+ return (err);
+ drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
+ drc->drc_newfs = B_TRUE;
+ }
- dmu_buf_will_dirty(hds->ds_dbuf, tx);
- hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ return (0);
}
+struct restorearg {
+ int err;
+ int byteswap;
+ kthread_t *td;
+ struct file *fp;
+ char *buf;
+ uint64_t voff;
+ int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t cksum;
+};
+
static int
restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
{
@@ -491,37 +744,31 @@ static void *
restore_read(struct restorearg *ra, int len)
{
void *rv;
+ int done = 0;
/* some things will require 8-byte alignment, so everything must */
ASSERT3U(len % 8, ==, 0);
- while (ra->buflen - ra->bufoff < len) {
+ while (done < len) {
int resid;
- int leftover = ra->buflen - ra->bufoff;
- (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+ ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
+ len - done, ra->voff, &resid);
- ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
- ra->bufsize - leftover, ra->voff, &resid);
-
- ra->voff += ra->bufsize - leftover - resid;
- ra->buflen = ra->bufsize - resid;
- ra->bufoff = 0;
- if (resid == ra->bufsize - leftover)
+ if (resid == len - done)
ra->err = EINVAL;
+ ra->voff += len - done - resid;
+ done = len - resid;
if (ra->err)
return (NULL);
- /* Could compute checksum here? */
}
- ASSERT3U(ra->bufoff % 8, ==, 0);
- ASSERT3U(ra->buflen - ra->bufoff, >=, len);
- rv = ra->buf + ra->bufoff;
- ra->bufoff += len;
+ ASSERT3U(done, ==, len);
+ rv = ra->buf;
if (ra->byteswap)
- fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
else
- fletcher_4_incremental_native(rv, len, &ra->zc);
+ fletcher_4_incremental_native(rv, len, &ra->cksum);
return (rv);
}
@@ -531,12 +778,14 @@ backup_byteswap(dmu_replay_record_t *drr)
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
switch (drr->drr_type) {
case DRR_BEGIN:
DO64(drr_begin.drr_magic);
DO64(drr_begin.drr_version);
DO64(drr_begin.drr_creation_time);
DO32(drr_begin.drr_type);
+ DO32(drr_begin.drr_flags);
DO64(drr_begin.drr_toguid);
DO64(drr_begin.drr_fromguid);
break;
@@ -643,13 +892,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
if (data == NULL) {
dmu_tx_commit(tx);
return (ra->err);
}
- bcopy(data, db->db_data, db->db_size);
+ bcopy(data, db->db_data, drro->drr_bonuslen);
if (ra->byteswap) {
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
@@ -673,23 +922,14 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
int err;
if (dmu_object_info(os, obj, NULL) != 0)
continue;
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
+ err = dmu_free_object(os, obj);
+ if (err)
return (err);
- }
- err = dmu_object_free(os, obj, tx);
- dmu_tx_commit(tx);
- if (err && err != ENOENT)
- return (EINVAL);
}
return (0);
}
@@ -735,7 +975,6 @@ static int
restore_free(struct restorearg *ra, objset_t *os,
struct drr_free *drrf)
{
- dmu_tx_t *tx;
int err;
if (drrf->drr_length != -1ULL &&
@@ -745,66 +984,65 @@ restore_free(struct restorearg *ra, objset_t *os,
if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
return (EINVAL);
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
+ err = dmu_free_long_range(os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
return (err);
}
+void
+dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
+{
+ if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
+ /*
+ * online incremental or new fs: destroy the fs (which
+ * may be a clone) that we created
+ */
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (drc->drc_real_ds != drc->drc_logical_ds)
+ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+ } else {
+ /*
+ * offline incremental: rollback to most recent snapshot.
+ */
+ (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
+ dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
+ }
+}
+
+/*
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
int
-dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset)
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
{
kthread_t *td = curthread;
- struct restorearg ra;
+ struct restorearg ra = { 0 };
dmu_replay_record_t *drr;
- char *cp;
- objset_t *os = NULL;
- zio_cksum_t pzc;
-
- bzero(&ra, sizeof (ra));
- ra.td = td;
- ra.fp = fp;
- ra.voff = voffset;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+ objset_t *os;
+ zio_cksum_t pcksum;
- if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
- ra.byteswap = FALSE;
- } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
ra.byteswap = TRUE;
- } else {
- ra.err = EINVAL;
- goto out;
- }
- /*
- * NB: this assumes that struct drr_begin will be the largest in
- * dmu_replay_record_t's drr_u, and thus we don't need to pad it
- * with zeros to make it the same length as we wrote out.
- */
- ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
- ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
- ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
- if (ra.byteswap) {
- fletcher_4_incremental_byteswap(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- } else {
- fletcher_4_incremental_native(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
+ {
+ /* compute checksum of drr_begin record */
+ dmu_replay_record_t *drr;
+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+
+ drr->drr_type = DRR_BEGIN;
+ drr->drr_u.drr_begin = *drc->drc_drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(drr,
+ sizeof (dmu_replay_record_t), &ra.cksum);
+ } else {
+ fletcher_4_incremental_native(drr,
+ sizeof (dmu_replay_record_t), &ra.cksum);
+ }
+ kmem_free(drr, sizeof (dmu_replay_record_t));
}
- (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
if (ra.byteswap) {
+ struct drr_begin *drrb = drc->drc_drrb;
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
drrb->drr_version = BSWAP_64(drrb->drr_version);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
@@ -813,94 +1051,30 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
}
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
-
- if (drrb->drr_version != DMU_BACKUP_VERSION ||
- drrb->drr_type >= DMU_OST_NUMTYPES ||
- strchr(drrb->drr_toname, '@') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * Process the begin in syncing context.
- */
- if (drrb->drr_fromguid) {
- /* incremental backup */
- dsl_dataset_t *ds = NULL;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
- *cp = '@';
- if (ra.err)
- goto out;
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- drrb->drr_fromguid) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(ra.buf, ra.bufsize);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds);
- }
- ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- replay_incremental_check, replay_incremental_sync,
- ds, drrb, 1);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full backup */
- dsl_dir_t *dd = NULL;
- const char *tail;
-
- /* can't restore full backup into topmost fs, for now */
- if (strrchr(drrb->drr_toname, '/') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
- *cp = '@';
- if (ra.err)
- goto out;
- if (tail == NULL) {
- ra.err = EEXIST;
- goto out;
- }
+ ra.td = td;
+ ra.fp = fp;
+ ra.voff = *voffp;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
- ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
- replay_full_sync, dd, drrb, 5);
- dsl_dir_close(dd, FTAG);
- }
- if (ra.err)
- goto out;
+ /* these were verified in dmu_recv_begin */
+ ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
+ ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
/*
* Open the objset we are modifying.
*/
+ VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
- DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
- *cp = '@';
- ASSERT3U(ra.err, ==, 0);
+ ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
/*
* Read records and process them.
*/
- pzc = ra.zc;
+ pcksum = ra.cksum;
while (ra.err == 0 &&
NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (SIGPENDING(td)) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
ra.err = EINTR;
goto out;
}
@@ -947,63 +1121,116 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
* value, because the stored checksum is of
* everything before the DRR_END record.
*/
- if (drre.drr_checksum.zc_word[0] != 0 &&
- !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
+ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
ra.err = ECKSUM;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
- ds_dir->dd_pool, replay_end_check, replay_end_sync,
- os, drrb, 3);
goto out;
}
default:
ra.err = EINVAL;
goto out;
}
- pzc = ra.zc;
+ pcksum = ra.cksum;
}
+ ASSERT(ra.err != 0);
out:
- if (os)
- dmu_objset_close(os);
+ dmu_objset_close(os);
- /*
- * Make sure we don't rollback/destroy unless we actually
- * processed the begin properly. 'os' will only be set if this
- * is the case.
- */
- if (ra.err && os && tosnap && strchr(tosnap, '@')) {
+ if (ra.err != 0) {
/*
* rollback or destroy what we created, so we don't
* leave it in the restoring state.
*/
- dsl_dataset_t *ds;
- int err;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- err = dsl_dataset_open(tosnap,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err == 0) {
- txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (drrb->drr_fromguid) {
- /* incremental: rollback to most recent snap */
- (void) dsl_dataset_rollback(ds);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full: destroy whole fs */
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- (void) dsl_dataset_destroy(tosnap);
- }
- }
- *cp = '@';
+ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
+ dmu_recv_abort_cleanup(drc);
}
kmem_free(ra.buf, ra.bufsize);
- if (sizep)
- *sizep = ra.voff;
+ *voffp = ra.voff;
return (ra.err);
}
+
+struct recvendsyncarg {
+ char *tosnap;
+ uint64_t creation_time;
+ uint64_t toguid;
+};
+
+static int
+recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvendsyncarg *resa = arg2;
+
+ return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+}
+
+static void
+recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvendsyncarg *resa = arg2;
+
+ dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
+ ds->ds_prev->ds_phys->ds_guid = resa->toguid;
+ ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+ struct recvendsyncarg resa;
+ dsl_dataset_t *ds = drc->drc_logical_ds;
+ int err;
+
+ /*
+ * XXX hack; seems the ds is still dirty and
+ * dsl_pool_zil_clean() expects it to have a ds_user_ptr
+ * (and zil), but clone_swap() can close it.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+ if (ds != drc->drc_real_ds) {
+ /* we are doing an online recv */
+ if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+ err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+ drc->drc_force);
+ if (err)
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ } else {
+ err = EBUSY;
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ }
+ /* dsl_dataset_destroy() will disown the ds */
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (err)
+ return (err);
+ }
+
+ resa.creation_time = drc->drc_drrb->drr_creation_time;
+ resa.toguid = drc->drc_drrb->drr_toguid;
+ resa.tosnap = drc->drc_tosnap;
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_end_check, recv_end_sync, ds, &resa, 3);
+ if (err) {
+ if (drc->drc_newfs) {
+ ASSERT(ds == drc->drc_real_ds);
+ (void) dsl_dataset_destroy(ds, dmu_recv_tag);
+ return (err);
+ } else {
+ (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ }
+ }
+
+ /* release the hold from dmu_recv_begin */
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 3d2bc3e47678..43bf82e7a682 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
#define BP_SPAN_SHIFT(level, width) ((level) * (width))
@@ -261,6 +262,16 @@ advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
return (EAGAIN);
}
+/*
+ * The traverse_callback function will call the function specified in th_func.
+ * In the event of an error the callee, specified by th_func, must return
+ * one of the following errors:
+ *
+ * EINTR - Indicates that the callee wants the traversal to
+ * abort immediately.
+ * ERESTART - The callee has acknowledged the error and would
+ * like to continue.
+ */
static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
@@ -603,7 +614,10 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
th->th_locked = 0;
}
- rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+ if (BP_IS_HOLE(&dsp->ds_bp))
+ rc = ERESTART;
+ else
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
if (rc != 0) {
if (rc == ERESTART)
@@ -722,6 +736,24 @@ traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
}
int
+traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
+
+ traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
traverse_more(traverse_handle_t *th)
{
zseg_t *zseg = list_head(&th->th_seglist);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index 13fd8d4d9dce..000c3ce64eb5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
@@ -157,7 +155,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (EIO);
- err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
dbuf_rele(db, FTAG);
return (err);
}
@@ -294,6 +292,8 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh)
txh->txh_space_tooverwrite += space;
} else {
txh->txh_space_towrite += space;
+ if (dn && dn->dn_dbuf->db_blkptr)
+ txh->txh_space_tounref += space;
}
}
@@ -318,39 +318,25 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
- uint64_t blkid, nblks;
- uint64_t space = 0;
+ uint64_t blkid, nblks, lastblk;
+ uint64_t space = 0, unref = 0, skipped = 0;
dnode_t *dn = txh->txh_dnode;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- int dirty;
+ int epbs;
- /*
- * We don't need to use any locking to check for dirtyness
- * because it's OK if we get stale data -- the dnode may become
- * dirty immediately after our check anyway. This is just a
- * means to avoid the expensive count when we aren't sure we
- * need it. We need to be able to deal with a dirty dnode.
- */
- dirty = list_link_active(&dn->dn_dirty_link[0]) |
- list_link_active(&dn->dn_dirty_link[1]) |
- list_link_active(&dn->dn_dirty_link[2]) |
- list_link_active(&dn->dn_dirty_link[3]);
- if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+ if (dn->dn_nlevels == 0)
return;
/*
- * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * The struct_rwlock protects us against dn_nlevels
* changing, in case (against all odds) we manage to dirty &
* sync out the changes after we check for being dirty.
- * also, dbuf_hold_impl() wants us to have the struct_rwlock.
- *
- * It's fine to use dn_datablkshift rather than the dn_phys
- * equivalent because if it is changing, maxblkid==0 and we will
- * bail.
+ * Also, dbuf_hold_level() wants us to have the struct_rwlock.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_phys->dn_maxblkid == 0) {
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_maxblkid == 0) {
if (off == 0 && len >= dn->dn_datablksz) {
blkid = 0;
nblks = 1;
@@ -360,78 +346,120 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
} else {
blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
+ nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_phys->dn_maxblkid) {
+ if (blkid >= dn->dn_maxblkid) {
rw_exit(&dn->dn_struct_rwlock);
return;
}
- if (blkid + nblks > dn->dn_phys->dn_maxblkid)
- nblks = dn->dn_phys->dn_maxblkid - blkid;
+ if (blkid + nblks > dn->dn_maxblkid)
+ nblks = dn->dn_maxblkid - blkid;
- /* don't bother after 128,000 blocks */
- nblks = MIN(nblks, 128*1024);
}
-
- if (dn->dn_phys->dn_nlevels == 1) {
+ if (dn->dn_nlevels == 1) {
int i;
for (i = 0; i < nblks; i++) {
blkptr_t *bp = dn->dn_phys->dn_blkptr;
- ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ ASSERT3U(blkid + i, <, dn->dn_nblkptr);
bp += blkid + i;
if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
space += bp_get_dasize(spa, bp);
}
+ unref += BP_GET_ASIZE(bp);
}
nblks = 0;
}
+ /*
+ * Add in memory requirements of higher-level indirects.
+ * This assumes a worst-possible scenario for dn_nlevels.
+ */
+ {
+ uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
+ int level = (dn->dn_nlevels > 1) ? 2 : 1;
+
+ while (level++ < DN_MAX_LEVELS) {
+ txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
+ blkcnt = 1 + (blkcnt >> epbs);
+ }
+ ASSERT(blkcnt <= dn->dn_nblkptr);
+ }
+
+ lastblk = blkid + nblks - 1;
while (nblks) {
dmu_buf_impl_t *dbuf;
- int err, epbs, blkoff, tochk;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- blkoff = P2PHASE(blkid, 1<<epbs);
- tochk = MIN((1<<epbs) - blkoff, nblks);
-
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
- if (err == 0) {
- int i;
- blkptr_t *bp;
-
- err = dbuf_read(dbuf, NULL,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- dbuf_rele(dbuf, FTAG);
- break;
- }
+ uint64_t ibyte, new_blkid;
+ int epb = 1 << epbs;
+ int err, i, blkoff, tochk;
+ blkptr_t *bp;
+
+ ibyte = blkid << dn->dn_datablkshift;
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
+ new_blkid = ibyte >> dn->dn_datablkshift;
+ if (err == ESRCH) {
+ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+ break;
+ }
+ if (err) {
+ txh->txh_tx->tx_err = err;
+ break;
+ }
+ if (new_blkid > lastblk) {
+ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+ break;
+ }
- bp = dbuf->db.db_data;
- bp += blkoff;
+ if (new_blkid > blkid) {
+ ASSERT((new_blkid >> epbs) > (blkid >> epbs));
+ skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
+ nblks -= new_blkid - blkid;
+ blkid = new_blkid;
+ }
+ blkoff = P2PHASE(blkid, epb);
+ tochk = MIN(epb - blkoff, nblks);
- for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth)) {
- dprintf_bp(&bp[i],
- "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
- }
- }
+ dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
+
+ txh->txh_memory_tohold += dbuf->db.db_size;
+ if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
+ txh->txh_tx->tx_err = E2BIG;
dbuf_rele(dbuf, FTAG);
+ break;
}
- if (err && err != ENOENT) {
+ err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
txh->txh_tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
break;
}
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+ dprintf_bp(&bp[i], "can free old%s", "");
+ space += bp_get_dasize(spa, &bp[i]);
+ }
+ unref += BP_GET_ASIZE(bp);
+ }
+ dbuf_rele(dbuf, FTAG);
+
blkid += tochk;
nblks -= tochk;
}
rw_exit(&dn->dn_struct_rwlock);
+ /* account for new level 1 indirect blocks that might show up */
+ if (skipped > 0) {
+ txh->txh_fudge += skipped << dn->dn_indblkshift;
+ skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
+ txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+ }
txh->txh_space_tofree += space;
+ txh->txh_space_tounref += unref;
}
void
@@ -466,7 +494,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
/*
* For i/o error checking, read the first and last level-0
* blocks, and all the level-1 blocks. The above count_write's
- * will take care of the level-0 blocks.
+ * have already taken care of the level-0 blocks.
*/
if (dn->dn_nlevels > 1) {
shift = dn->dn_datablkshift + dn->dn_indblkshift -
@@ -478,7 +506,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
NULL, NULL, ZIO_FLAG_CANFAIL);
for (i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
if (err == ESRCH)
break;
@@ -550,10 +578,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
* the size will change between now and the dbuf dirty call.
*/
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth))
+ dn->dn_phys->dn_blkptr[0].blk_birth)) {
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
+ } else {
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref +=
+ BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
+ }
return;
}
@@ -575,7 +606,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
* 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
*/
dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
- (3 + add ? 3 : 0) << dn->dn_datablkshift);
+ (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
/*
* If the modified blocks are scattered to the four winds,
@@ -698,12 +729,13 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
match_offset = TRUE;
break;
case THT_FREE:
- if (blkid == beginblk &&
- (txh->txh_arg1 != 0 ||
- dn->dn_maxblkid == 0))
- match_offset = TRUE;
- if (blkid == endblk &&
- txh->txh_arg2 != DMU_OBJECT_END)
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
case THT_BONUS:
@@ -733,12 +765,32 @@ static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
dmu_tx_hold_t *txh;
- uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
+ spa_t *spa = tx->tx_pool->dp_spa;
+ uint64_t memory, asize, fsize, usize;
+ uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
ASSERT3U(tx->tx_txg, ==, 0);
+
if (tx->tx_err)
return (tx->tx_err);
+ if (spa_suspended(spa)) {
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ txg_how != TXG_WAIT)
+ return (EIO);
+
+ return (ERESTART);
+ }
+
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
@@ -748,7 +800,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
* dmu_tx_unassign() logic.
*/
- towrite = tofree = tooverwrite = 0;
+ towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
@@ -768,6 +820,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
towrite += txh->txh_space_towrite;
tofree += txh->txh_space_tofree;
tooverwrite += txh->txh_space_tooverwrite;
+ tounref += txh->txh_space_tounref;
+ tohold += txh->txh_memory_tohold;
+ fudge += txh->txh_fudge;
}
/*
@@ -788,22 +843,31 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
tooverwrite = tofree = 0;
}
- /*
- * Convert logical size to worst-case allocated size.
- */
+ /* needed allocation: worst-case estimate of write space */
+ asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
+ /* freed space estimate: worst-case overwrite + free estimate */
fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
- lsize = towrite + tooverwrite;
- asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ /* convert unrefd space to worst-case estimate */
+ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+ /* calculate memory footprint estimate */
+ memory = towrite + tooverwrite + tohold;
#ifdef ZFS_DEBUG
- tx->tx_space_towrite = asize;
+ /*
+ * Add in 'tohold' to account for our dirty holds on this memory
+ * XXX - the "fudge" factor is to account for skipped blocks that
+ * we missed because dnode_next_offset() misses in-core-only blocks.
+ */
+ tx->tx_space_towrite = asize +
+ spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
tx->tx_space_tofree = tofree;
tx->tx_space_tooverwrite = tooverwrite;
+ tx->tx_space_tounref = tounref;
#endif
if (tx->tx_dir && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir,
- lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
if (err)
return (err);
}
@@ -885,10 +949,18 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
void
dmu_tx_wait(dmu_tx_t *tx)
{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
ASSERT(tx->tx_txg == 0);
- ASSERT(tx->tx_lasttried_txg != 0);
- if (tx->tx_needassign_txh) {
+ /*
+ * It's possible that the pool has become active after this thread
+ * has tried to obtain a tx. If that's the case then his
+ * tx_lasttried_txg would not have been assigned.
+ */
+ if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
@@ -948,6 +1020,7 @@ dmu_tx_commit(dmu_tx_t *tx)
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
+ list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
@@ -975,6 +1048,7 @@ dmu_tx_abort(dmu_tx_t *tx)
if (dn != NULL)
dnode_rele(dn, tx);
}
+ list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
refcount_destroy_many(&tx->tx_space_written,
refcount_count(&tx->tx_space_written));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index b25cc898c37d..8dba38176527 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -38,10 +38,6 @@
*/
int zfs_prefetch_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
- &zfs_prefetch_disable, 0, "Disable prefetch");
/* max # of streams per zfetch */
uint32_t zfetch_max_streams = 8;
@@ -52,6 +48,25 @@ uint32_t zfetch_block_cap = 256;
/* number of bytes in a array_read at which we stop prefetching (1Mb) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
+ &zfs_prefetch_disable, 0, "Disable prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
+TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN,
+ &zfetch_max_streams, 0, "Max # of streams per zfetch");
+TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
+ &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
+TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
+ &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
+TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
+SYSCTL_QUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
+ &zfetch_array_rd_sz, 0,
+ "Number of bytes in a array_read at which we stop prefetching");
+
/* forward decls for static routines */
static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index ca502857b1fa..5adbc3c0ff5d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
@@ -242,6 +240,23 @@ free_range_compar(const void *node1, const void *node2)
else return (0);
}
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -285,6 +300,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
list_insert_head(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
+ arc_space_consume(sizeof (dnode_t));
return (dn);
}
@@ -319,6 +335,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_bonus = NULL;
}
kmem_cache_free(dnode_cache, dn);
+ arc_space_return(sizeof (dnode_t));
}
void
@@ -362,6 +379,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -389,6 +407,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -396,7 +415,7 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int i;
+ int i, old_nblkptr;
dmu_buf_impl_t *db = NULL;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@@ -413,7 +432,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
/* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -436,38 +455,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
}
dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
rw_exit(&dn->dn_struct_rwlock);
- if (db) {
+ if (db)
dbuf_rele(db, FTAG);
- db = NULL;
- }
/* change type */
dn->dn_type = ot;
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
/* change bonus size and type */
mutex_enter(&dn->dn_mtx);
+ old_nblkptr = dn->dn_nblkptr;
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
@@ -475,12 +474,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
+ /* XXX - for now, we can't make nblkptr smaller */
+ ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+ /* fix up the bonus db_size if dn_nblkptr has changed */
+ if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) {
+ dn->dn_bonus->db.db_size =
+ DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
@@ -559,6 +561,12 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
dmu_buf_impl_t *db;
dnode_t **children_dnodes;
+ /*
+ * If you are holding the spa config lock as writer, you shouldn't
+ * be asking the DMU to do *anything*.
+ */
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
@@ -602,9 +610,10 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
}
if ((dn = children_dnodes[idx]) == NULL) {
+ dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
- db, object);
+
+ dn = dnode_create(os, dnp, db, object);
winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
if (winner != NULL) {
dnode_destroy(dn);
@@ -644,11 +653,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
-void
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
dnode_add_ref(dnode_t *dn, void *tag)
{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
+ mutex_enter(&dn->dn_mtx);
+ if (refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
}
void
@@ -656,7 +676,9 @@ dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
+ mutex_exit(&dn->dn_mtx);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
dbuf_rele(dn->dn_dbuf, dn);
@@ -692,6 +714,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
@@ -714,7 +737,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
(void) dbuf_dirty(dn->dn_dbuf, tx);
@@ -762,7 +785,7 @@ int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
- int have_db0 = FALSE;
+ int err;
if (size == 0)
size = SPA_MINBLOCKSIZE;
@@ -787,9 +810,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid == 0) {
- have_db0 = TRUE;
- } else if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -799,12 +820,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs && dn->dn_nlevels != 1)
goto fail;
- db = NULL;
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ if (err == 0)
dbuf_new_size(db, size, tx);
- }
+ else if (err != ENOENT)
+ goto fail;
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
@@ -813,7 +834,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dn->dn_indblkshift = ibs;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
}
-
+ /* rele after we have fixed the blocksize in the dnode */
if (db)
dbuf_rele(db, FTAG);
@@ -825,19 +846,32 @@ fail:
return (ENOTSUP);
}
+/* read-holding callers must not rely on the lock being continuously held */
void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
- int drop_struct_lock = FALSE;
int epbs, new_nlevels;
uint64_t sz;
ASSERT(blkid != DB_BONUS_BLKID);
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- drop_struct_lock = TRUE;
+ ASSERT(have_read ?
+ RW_READ_HELD(&dn->dn_struct_rwlock) :
+ RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * if we have a read-lock, check to see if we need to do any work
+ * before upgrading to a write-lock.
+ */
+ if (have_read) {
+ if (blkid <= dn->dn_maxblkid)
+ return;
+
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
}
if (blkid <= dn->dn_maxblkid)
@@ -889,8 +923,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
}
out:
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
+ if (have_read)
+ rw_downgrade(&dn->dn_struct_rwlock);
}
void
@@ -951,15 +985,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
uint64_t blkoff, blkid, nblks;
- int blksz, head;
+ int blksz, blkshift, head, tail;
int trunc = FALSE;
+ int epbs;
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- /* If the range is past the end of the file, this is a no-op */
- if (off >= blksz * (dn->dn_maxblkid+1))
- goto out;
if (len == -1ULL) {
len = UINT64_MAX - off;
trunc = TRUE;
@@ -971,11 +1005,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (ISP2(blksz)) {
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
- /* Freeing the whole block; don't do any head. */
- head = 0;
+ /* Freeing the whole block; fast-track this request */
+ blkid = 0;
+ nblks = 1;
+ goto done;
+ } else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ goto out;
} else {
/* Freeing part of the block. */
head = blksz - off;
@@ -1008,88 +1049,95 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
/* If the range was less than one block, we're done */
- if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ if (len == 0)
goto out;
- if (!ISP2(blksz)) {
- /*
- * They are freeing the whole block of a
- * non-power-of-two blocksize file. Skip all the messy
- * math.
- */
- ASSERT3U(off, ==, 0);
- ASSERT3U(len, >=, blksz);
- blkid = 0;
- nblks = 1;
- } else {
- int tail;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int blkshift = dn->dn_datablkshift;
-
- /* If the remaining range is past end of file, we're done */
- if (off > dn->dn_maxblkid << blkshift)
- goto out;
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
- if (off + len == UINT64_MAX)
- tail = 0;
- else
- tail = P2PHASE(len, blksz);
-
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr &&
- !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock,
- RW_WRITER);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
+ ASSERT(ISP2(blksz));
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ bzero(db->db.db_data, tail);
}
- len -= tail;
+ dbuf_rele(db, FTAG);
}
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
+ len -= tail;
+ }
- /* dirty the left indirects */
- if (dn->dn_nlevels > 1 && off != 0) {
- db = dbuf_hold_level(dn, 1,
- (off - head) >> (blkshift + epbs), FTAG);
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
+
+ /*
+ * Read in and mark all the level-1 indirects dirty,
+ * so that they will stay in memory until syncing phase.
+ * Always dirty the first and last indirect to make sure
+ * we dirty all the partial indirects.
+ */
+ if (dn->dn_nlevels > 1) {
+ uint64_t i, first, last;
+ int shift = epbs + dn->dn_datablkshift;
+
+ first = blkid >> epbs;
+ if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
-
- /* dirty the right indirects */
- if (dn->dn_nlevels > 1 && !trunc) {
- db = dbuf_hold_level(dn, 1,
- (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
+ else
+ last = (blkid + nblks - 1) >> epbs;
+ if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
-
- /*
- * Finally, add this range to the dnode range list, we
- * will finish up this free operation in the syncing phase.
- */
- ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
- ASSERT(off + len == UINT64_MAX ||
- IS_P2ALIGNED(len, 1<<blkshift));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
-
- if (trunc)
- dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ for (i = first + 1; i < last; i++) {
+ uint64_t ibyte = i << shift;
+ int err;
+
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i >= last)
+ break;
+ ASSERT(err == 0);
+ db = dbuf_hold_level(dn, 1, i, FTAG);
+ if (db) {
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+ }
}
-
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, blkid, nblks, tx);
{
@@ -1109,9 +1157,12 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
mutex_exit(&dn->dn_mtx);
- dbuf_free_range(dn, blkid, nblks, tx);
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
+ if (trunc && dn->dn_maxblkid >= (off >> blkshift))
+ dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
+
rw_exit(&dn->dn_struct_rwlock);
}
@@ -1179,7 +1230,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
ASSERT3U(space, >=, -delta); /* no underflow */
}
space += delta;
- if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
@@ -1211,7 +1262,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
}
static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
@@ -1219,11 +1270,16 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t epb = 1ULL << epbs;
uint64_t minfill, maxfill;
- int i, error, span;
+ boolean_t hole;
+ int i, inc, error, span;
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+ hole = flags & DNODE_FIND_HOLE;
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+ ASSERT(txg == 0 || !hole);
+
if (lvl == dn->dn_phys->dn_nlevels) {
error = 0;
epb = dn->dn_phys->dn_nblkptr;
@@ -1232,9 +1288,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
if (error) {
- if (error == ENOENT)
- return (hole ? 0 : ESRCH);
- return (error);
+ if (error != ENOENT)
+ return (error);
+ if (hole)
+ return (0);
+ /*
+ * This can only happen when we are searching up
+ * the block tree for data. We don't really need to
+ * adjust the offset, as we will just end up looking
+ * at the pointer to this block in its parent, and its
+ * going to be unallocated, so we will skip over it.
+ */
+ return (ESRCH);
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
if (error) {
@@ -1246,13 +1311,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
if (db && txg &&
(db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+ /*
+ * This can only happen when we are searching up the tree
+ * and these conditions mean that we need to keep climbing.
+ */
error = ESRCH;
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
span = DNODE_SHIFT;
ASSERT(dn->dn_type == DMU_OT_DNODE);
- for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ for (i = (*offset >> span) & (blkfill - 1);
+ i >= 0 && i < blkfill; i += inc) {
boolean_t newcontents = B_TRUE;
if (txg) {
int j;
@@ -1264,9 +1334,9 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
}
if (!dnp[i].dn_type == hole && newcontents)
break;
- *offset += 1ULL << span;
+ *offset += (1ULL << span) * inc;
}
- if (i == blkfill)
+ if (i < 0 || i == blkfill)
error = ESRCH;
} else {
blkptr_t *bp = data;
@@ -1280,14 +1350,17 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
minfill++;
for (i = (*offset >> span) & ((1ULL << epbs) - 1);
- i < epb; i++) {
+ i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill &&
bp[i].blk_fill <= maxfill &&
- bp[i].blk_birth > txg)
+ (hole || bp[i].blk_birth > txg))
break;
- *offset += 1ULL << span;
+ if (inc < 0 && *offset < (1ULL << span))
+ *offset = 0;
+ else
+ *offset += (1ULL << span) * inc;
}
- if (i >= epb)
+ if (i < 0 || i == epb)
error = ESRCH;
}
@@ -1306,64 +1379,66 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
*
* Examples:
*
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- * Finds the next hole/data in a file.
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
* Used in dmu_offset_next().
*
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
* Finds the next free/allocated dnode an objset's meta-dnode.
* Only finds objects that have new contents since txg (ie.
* bonus buffer changes and content removal are ignored).
* Used in dmu_object_next().
*
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
* Used in dmu_object_alloc().
*/
int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
+ uint64_t initial_offset = *offset;
int lvl, maxlvl;
int error = 0;
- uint64_t initial_offset = *offset;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_phys->dn_nlevels == 0) {
- rw_exit(&dn->dn_struct_rwlock);
- return (ESRCH);
+ error = ESRCH;
+ goto out;
}
if (dn->dn_datablkshift == 0) {
if (*offset < dn->dn_datablksz) {
- if (hole)
+ if (flags & DNODE_FIND_HOLE)
*offset = dn->dn_datablksz;
} else {
error = ESRCH;
}
- rw_exit(&dn->dn_struct_rwlock);
- return (error);
+ goto out;
}
maxlvl = dn->dn_phys->dn_nlevels;
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
if (error != ESRCH)
break;
}
- while (--lvl >= minlvl && error == 0) {
+ while (error == 0 && --lvl >= minlvl) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
}
- rw_exit(&dn->dn_struct_rwlock);
-
- if (error == 0 && initial_offset > *offset)
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
error = ESRCH;
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 9e8c7adbda01..a46d4e70abc8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,9 +55,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
ASSERT(db != NULL);
dn->dn_phys->dn_nlevels = new_level;
- dprintf("os=%p obj=%llu, increase to %d\n",
- dn->dn_objset, dn->dn_object,
- dn->dn_phys->dn_nlevels);
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
/* check for existing blkptrs in the dnode */
for (i = 0; i < nblkptr; i++)
@@ -110,25 +109,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
-static void
+static int
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
- int i;
+ int i, blocks_freed = 0;
- dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
for (i = 0; i < num; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += bp_get_dasize(os->os_spa, bp);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
bzero(bp, sizeof (blkptr_t));
+ blocks_freed += 1;
}
dnode_diduse_space(dn, -bytesfreed);
+ return (blocks_freed);
}
#ifdef ZFS_DEBUG
@@ -160,7 +160,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
rw_exit(&db->db_dnode->dn_struct_rwlock);
if (err == ENOENT)
continue;
@@ -178,7 +178,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
if (buf[j] != 0) {
panic("freed data not zero: "
"child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
+ (void *)child, i, off, num);
}
}
}
@@ -195,7 +195,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
if (buf[j] != 0) {
panic("freed data not zero: "
"child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
+ (void *)child, i, off, num);
}
}
}
@@ -206,6 +206,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
+#define ALL -1
+
static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
@@ -216,8 +218,18 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
int all = TRUE;
+ int blocks_freed = 0;
+
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if we didn't get a dirty hold (because this block had just
+ * finished being written -- and so had no holds), and then this
+ * block got evicted before we got here.
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
bp = (blkptr_t *)db->db.db_data;
@@ -241,10 +253,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
+ blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
for (i = start; i <= end; i++, bp++) {
@@ -255,9 +267,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(subdb->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ blocks_freed += free_blocks(dn, bp, 1, tx);
} else {
all = FALSE;
}
@@ -274,8 +286,8 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
ASSERT3U(bp->blk_birth, ==, 0);
}
#endif
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
/*
@@ -305,15 +317,14 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
+ (void) free_blocks(dn, bp + blkid, nblks, tx);
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off,
- 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
return;
}
@@ -331,9 +342,9 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(db, blkid, nblks, trunc, tx)) {
+ if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(db->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ (void) free_blocks(dn, bp, 1, tx);
}
dbuf_rele(db, FTAG);
}
@@ -343,15 +354,15 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
}
/*
* Try to kick all the dnodes dbufs out of the cache...
*/
-int
-dnode_evict_dbufs(dnode_t *dn, int try)
+void
+dnode_evict_dbufs(dnode_t *dn)
{
int progress;
int pass = 0;
@@ -367,6 +378,7 @@ dnode_evict_dbufs(dnode_t *dn, int try)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
+ ASSERT3P(db->db_dnode, ==, dn);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
@@ -375,7 +387,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
mutex_exit(&db->db_mtx);
} else if (refcount_is_zero(&db->db_holds)) {
progress = TRUE;
- ASSERT(!arc_released(db->db_buf));
dbuf_clear(db); /* exits db_mtx for us */
} else {
mutex_exit(&db->db_mtx);
@@ -397,21 +408,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
ASSERT(pass < 100); /* sanity check */
} while (progress);
- /*
- * This function works fine even if it can't evict everything.
- * If were only asked to try to evict everything then
- * return an error if we can't. Otherwise panic as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
@@ -419,7 +415,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
dn->dn_bonus = NULL;
}
rw_exit(&dn->dn_struct_rwlock);
- return (0);
}
static void
@@ -460,8 +455,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -479,10 +481,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_indblkshift[txgoff] = 0;
dn->dn_next_blksz[txgoff] = 0;
- /* free up all the blocks in the file. */
- dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
- ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
/* ASSERT(blkptrs are zero); */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -496,6 +494,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_type = DMU_OT_NONE;
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -558,7 +557,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(list) != NULL ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -566,6 +565,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_blksz[txgoff] = 0;
}
+ if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -583,20 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
/* process all the "freed" ranges in the file */
- if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
- for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
- rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
- dnode_sync_free_range(dn,
- rp->fr_blkid, rp->fr_nblks, tx);
+ while (rp = avl_last(&dn->dn_ranges[txgoff])) {
+ dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
+ /* grab the mutex so we don't race with dnode_block_freed() */
+ mutex_enter(&dn->dn_mtx);
+ avl_remove(&dn->dn_ranges[txgoff], rp);
+ mutex_exit(&dn->dn_mtx);
+ kmem_free(rp, sizeof (free_range_t));
}
- mutex_enter(&dn->dn_mtx);
- for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
- free_range_t *last = rp;
- rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
- avl_remove(&dn->dn_ranges[txgoff], last);
- kmem_free(last, sizeof (free_range_t));
- }
- mutex_exit(&dn->dn_mtx);
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
dnode_sync_free(dn, tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 7d4689f3352a..20d8ec85cc91 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
@@ -38,35 +36,44 @@
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/zfs_znode.h>
+#include <sys/sunddi.h>
+
+static char *dsl_reaper = "the grim reaper";
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_checkfunc_t dsl_dataset_rollback_check;
static dsl_syncfunc_t dsl_dataset_rollback_sync;
-static dsl_checkfunc_t dsl_dataset_destroy_check;
-static dsl_syncfunc_t dsl_dataset_destroy_sync;
+static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
+
+
/*
- * We use weighted reference counts to express the various forms of exclusion
- * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
- * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
- * This makes the exclusion logic simple: the total refcnt for all opens cannot
- * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
- * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
- * just over half of the refcnt space, so there can't be more than one, but it
- * can peacefully coexist with any number of STANDARD opens.
+ * Figure out how much of this delta should be propogated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
*/
-static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
- 0, /* DS_MODE_NONE - invalid */
- 1, /* DS_MODE_STANDARD - unlimited number */
- (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
- DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
-};
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ uint64_t old_bytes, new_bytes;
+ if (ds->ds_reserved == 0)
+ return (delta);
+
+ old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+ return (new_bytes - old_bytes);
+}
void
dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
@@ -74,6 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
dprintf_bp(bp, "born, ds=%p\n", ds);
@@ -89,23 +97,28 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
* dsl_dir.
*/
ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
used, compressed, uncompressed, tx);
dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
return;
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
ds->ds_phys->ds_used_bytes += used;
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- used, compressed, uncompressed, tx);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+ compressed, uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
}
-void
+int
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx)
{
@@ -113,10 +126,11 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
+ ASSERT(pio != NULL);
ASSERT(dmu_tx_is_syncing(tx));
/* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
- return;
+ return (0);
ASSERT(used > 0);
if (ds == NULL) {
@@ -125,51 +139,59 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
* Account for the meta-objset space in its placeholder
* dataset.
*/
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ err = dsl_free(pio, tx->tx_pool,
+ tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
ASSERT(err == 0);
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-used, -compressed, -uncompressed, tx);
dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
+ return (used);
}
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+ ASSERT(!dsl_dataset_is_snapshot(ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
int err;
+ int64_t delta;
dprintf_bp(bp, "freeing: %s", "");
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ err = dsl_free(pio, tx->tx_pool,
+ tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
ASSERT(err == 0);
+ mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- /* XXX unique_bytes is not accurate for head datasets */
- /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+ ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
ds->ds_phys->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ delta, -compressed, -uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
- ds->ds_prev->ds_phys->ds_prev_snap_txg) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- mutex_enter(&ds->ds_prev->ds_lock);
- ds->ds_prev->ds_phys->ds_unique_bytes +=
- used;
- mutex_exit(&ds->ds_prev->ds_lock);
- }
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ ds->ds_prev->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ if (bp->blk_birth > ds->ds_origin_txg) {
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
}
}
mutex_enter(&ds->ds_lock);
@@ -180,6 +202,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
mutex_exit(&ds->ds_lock);
+
+ return (used);
}
uint64_t
@@ -216,32 +240,38 @@ static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
dsl_dataset_t *ds = dsv;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- /* open_refcount == DS_REF_MAX when deleting */
- ASSERT(ds->ds_open_refcount == 0 ||
- ds->ds_open_refcount == DS_REF_MAX);
+ ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
dprintf_ds(ds, "evicting %s\n", "");
- unique_remove(ds->ds_phys->ds_fsid_guid);
+ unique_remove(ds->ds_fsid_guid);
if (ds->ds_user_ptr != NULL)
ds->ds_user_evict_func(ds, ds->ds_user_ptr);
if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ dsl_dataset_drop_ref(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
bplist_close(&ds->ds_deadlist);
- dsl_dir_close(ds->ds_dir, ds);
+ if (ds->ds_dir)
+ dsl_dir_close(ds->ds_dir, ds);
- if (list_link_active(&ds->ds_synced_link))
- list_remove(&dp->dp_synced_objsets, ds);
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ if (mutex_owned(&ds->ds_lock))
+ mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
+ if (mutex_owned(&ds->ds_opening_lock))
+ mutex_exit(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ if (mutex_owned(&ds->ds_deadlist.bpl_lock))
+ mutex_exit(&ds->ds_deadlist.bpl_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
}
@@ -266,16 +296,54 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
return (err);
headphys = headdbuf->db_data;
err = zap_value_search(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
dmu_buf_rele(headdbuf, FTAG);
return (err);
}
-int
-dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag, dsl_dataset_t **dsp)
+static int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ matchtype_t mt;
+ int err;
+
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_FIRST;
+ else
+ mt = MT_EXACT;
+
+ err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+ value, mt, NULL, 0, NULL);
+ if (err == ENOTSUP && mt == MT_FIRST)
+ err = zap_lookup(mos, snapobj, name, 8, 1, value);
+ return (err);
+}
+
+static int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ matchtype_t mt;
+ int err;
+
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_FIRST;
+ else
+ mt = MT_EXACT;
+
+ err = zap_remove_norm(mos, snapobj, name, mt, tx);
+ if (err == ENOTSUP && mt == MT_FIRST)
+ err = zap_remove(mos, snapobj, name, tx);
+ return (err);
+}
+
+static int
+dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
@@ -297,8 +365,11 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
ds->ds_phys = dbuf->db_data;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
NULL);
+ rw_init(&ds->ds_rwlock, 0, 0, 0);
+ cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
err = bplist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
@@ -312,42 +383,65 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
* just opened it.
*/
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
}
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+ if (!dsl_dataset_is_snapshot(ds)) {
ds->ds_snapname[0] = '\0';
if (ds->ds_phys->ds_prev_snap_obj) {
- err = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds, &ds->ds_prev);
+ err = dsl_dataset_get_ref(dp,
+ ds->ds_phys->ds_prev_snap_obj,
+ ds, &ds->ds_prev);
}
- } else {
- if (snapname) {
-#ifdef ZFS_DEBUG
- dsl_dataset_phys_t *headphys;
- dmu_buf_t *headdbuf;
- err = dmu_bonus_hold(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
+
+ if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_t *origin;
+
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj,
+ FTAG, &origin);
if (err == 0) {
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1,
- &foundobj);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele(headdbuf, FTAG);
+ ds->ds_origin_txg =
+ origin->ds_phys->ds_creation_txg;
+ dsl_dataset_rele(origin, FTAG);
}
-#endif
- (void) strcat(ds->ds_snapname, snapname);
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
}
+ } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+ err = dsl_dataset_get_snapname(ds);
+ }
+
+ if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+ /*
+ * In sync context, we're called with either no lock
+ * or with the write lock. If we're not syncing,
+ * we're always called with the read lock held.
+ */
+ boolean_t need_lock =
+ !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
+ dsl_pool_sync_context(dp);
+
+ if (need_lock)
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+ err = dsl_prop_get_ds(ds,
+ "refreservation", sizeof (uint64_t), 1,
+ &ds->ds_reserved, NULL);
+ if (err == 0) {
+ err = dsl_prop_get_ds(ds,
+ "refquota", sizeof (uint64_t), 1,
+ &ds->ds_quota, NULL);
+ }
+
+ if (need_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ } else {
+ ds->ds_reserved = ds->ds_quota = 0;
}
if (err == 0) {
@@ -356,13 +450,14 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
if (err || winner) {
bplist_close(&ds->ds_deadlist);
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev,
- DS_MODE_NONE, ds);
- }
+ if (ds->ds_prev)
+ dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err) {
dmu_buf_rele(dbuf, tag);
@@ -370,101 +465,175 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
ds = winner;
} else {
- uint64_t new =
+ ds->ds_fsid_guid =
unique_insert(ds->ds_phys->ds_fsid_guid);
- if (new != ds->ds_phys->ds_fsid_guid) {
- /* XXX it won't necessarily be synced... */
- ds->ds_phys->ds_fsid_guid = new;
- }
}
}
ASSERT3P(ds->ds_dbuf, ==, dbuf);
ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
mutex_enter(&ds->ds_lock);
- if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
- (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
- !DS_MODE_IS_INCONSISTENT(mode)) ||
- (ds->ds_open_refcount + weight > DS_REF_MAX)) {
+ if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
mutex_exit(&ds->ds_lock);
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (EBUSY);
+ dmu_buf_rele(ds->ds_dbuf, tag);
+ return (ENOENT);
}
- ds->ds_open_refcount += weight;
mutex_exit(&ds->ds_lock);
-
*dsp = ds;
return (0);
}
+static int
+dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /*
+ * In syncing context we don't want the rwlock lock: there
+ * may be an existing writer waiting for sync phase to
+ * finish. We don't need to worry about such writers, since
+ * sync phase is single-threaded, so the writer can't be
+ * doing anything while we are active.
+ */
+ if (dsl_pool_sync_context(dp)) {
+ ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
+ return (0);
+ }
+
+ /*
+ * Normal users will hold the ds_rwlock as a READER until they
+ * are finished (i.e., call dsl_dataset_rele()). "Owners" will
+ * drop their READER lock after they set the ds_owner field.
+ *
+ * If the dataset is being destroyed, the destroy thread will
+ * obtain a WRITER lock for exclusive access after it's done its
+ * open-context work and then change the ds_owner to
+ * dsl_reaper once destruction is assured. So threads
+ * may block here temporarily, until the "destructability" of
+ * the dataset is determined.
+ */
+ ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
+ mutex_enter(&ds->ds_lock);
+ while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
+ rw_exit(&dp->dp_config_rwlock);
+ cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
+ if (DSL_DATASET_IS_DESTROYED(ds)) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_drop_ref(ds, tag);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ return (ENOENT);
+ }
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ }
+ mutex_exit(&ds->ds_lock);
+ return (0);
+}
+
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
+
+ if (err)
+ return (err);
+ return (dsl_dataset_hold_ref(*dsp, tag));
+}
+
int
-dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp)
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
+ dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
+
+ ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
+
+ if (err)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EBUSY);
+ }
+ return (0);
+}
+
+int
+dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
dsl_pool_t *dp;
- const char *tail;
+ const char *snapname;
uint64_t obj;
- dsl_dataset_t *ds = NULL;
int err = 0;
- err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
if (err)
return (err);
dp = dd->dd_pool;
obj = dd->dd_phys->dd_head_dataset_obj;
rw_enter(&dp->dp_config_rwlock, RW_READER);
- if (obj == 0) {
- /* A dataset with no associated objset */
+ if (obj)
+ err = dsl_dataset_get_ref(dp, obj, tag, dsp);
+ else
err = ENOENT;
+ if (err)
goto out;
- }
- if (tail != NULL) {
- objset_t *mos = dp->dp_meta_objset;
+ err = dsl_dataset_hold_ref(*dsp, tag);
- err = dsl_dataset_open_obj(dp, obj, NULL,
- DS_MODE_NONE, tag, &ds);
- if (err)
- goto out;
- obj = ds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- ds = NULL;
+ /* we may be looking for a snapshot */
+ if (err == 0 && snapname != NULL) {
+ dsl_dataset_t *ds = NULL;
- if (tail[0] != '@') {
+ if (*snapname++ != '@') {
+ dsl_dataset_rele(*dsp, tag);
err = ENOENT;
goto out;
}
- tail++;
- /* Look for a snapshot */
- if (!DS_MODE_IS_READONLY(mode)) {
- err = EROFS;
- goto out;
+ dprintf("looking for snapshot '%s'\n", snapname);
+ err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+ if (err == 0)
+ err = dsl_dataset_get_ref(dp, obj, tag, &ds);
+ dsl_dataset_rele(*dsp, tag);
+
+ ASSERT3U((err == 0), ==, (ds != NULL));
+
+ if (ds) {
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_snapname[0] == 0)
+ (void) strlcpy(ds->ds_snapname, snapname,
+ sizeof (ds->ds_snapname));
+ mutex_exit(&ds->ds_lock);
+ err = dsl_dataset_hold_ref(ds, tag);
+ *dsp = err ? NULL : ds;
}
- dprintf("looking for snapshot '%s'\n", tail);
- err = zap_lookup(mos, obj, tail, 8, 1, &obj);
- if (err)
- goto out;
}
- err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
-
out:
rw_exit(&dp->dp_config_rwlock);
dsl_dir_close(dd, FTAG);
-
- ASSERT3U((err == 0), ==, (ds != NULL));
- /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
-
- *dsp = ds;
return (err);
}
int
-dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
{
- return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+ int err = dsl_dataset_hold(name, owner, dsp);
+ if (err)
+ return (err);
+ if ((*dsp)->ds_phys->ds_num_children > 0 &&
+ !DS_MODE_IS_READONLY(flags)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EROFS);
+ }
+ if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EBUSY);
+ }
+ return (0);
}
void
@@ -477,11 +646,11 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
VERIFY(0 == dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
(void) strcat(name, "@");
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
if (!MUTEX_HELD(&ds->ds_lock)) {
- /*
- * We use a "recursive" mutex so that we
- * can call dprintf_ds() with ds_lock held.
- */
mutex_enter(&ds->ds_lock);
(void) strcat(name, ds->ds_snapname);
mutex_exit(&ds->ds_lock);
@@ -505,7 +674,6 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
if (ds->ds_snapname[0]) {
++result; /* adding one for the @-sign */
if (!MUTEX_HELD(&ds->ds_lock)) {
- /* see dsl_datset_name */
mutex_enter(&ds->ds_lock);
result += strlen(ds->ds_snapname);
mutex_exit(&ds->ds_lock);
@@ -519,119 +687,160 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}
void
-dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
+ rw_exit(&ds->ds_rwlock);
+ }
+ dsl_dataset_drop_ref(ds, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
+{
+ ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
+ (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
+
mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_open_refcount, >=, weight);
- ds->ds_open_refcount -= weight;
- dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
- mode, ds->ds_open_refcount);
+ ds->ds_owner = NULL;
+ if (RW_WRITE_HELD(&ds->ds_rwlock)) {
+ rw_exit(&ds->ds_rwlock);
+ cv_broadcast(&ds->ds_exclusive_cv);
+ }
mutex_exit(&ds->ds_lock);
+ if (ds->ds_dbuf)
+ dsl_dataset_drop_ref(ds, owner);
+ else
+ dsl_dataset_evict(ds->ds_dbuf, ds);
+}
- dmu_buf_rele(ds->ds_dbuf, tag);
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
+{
+ boolean_t gotit = FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_owner == NULL &&
+ (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
+ ds->ds_owner = owner;
+ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
+ rw_exit(&ds->ds_rwlock);
+ gotit = TRUE;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (gotit);
}
void
-dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
- objset_t *mos = dp->dp_meta_objset;
+ ASSERT3P(owner, ==, ds->ds_owner);
+ if (!RW_WRITE_HELD(&ds->ds_rwlock))
+ rw_enter(&ds->ds_rwlock, RW_WRITER);
+}
+
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ uint64_t flags, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
- dsl_dataset_t *ds;
uint64_t dsobj;
- dsl_dir_t *dd;
+ objset_t *mos = dp->dp_meta_objset;
- dsl_dir_create_root(mos, ddobjp, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
+ if (origin == NULL)
+ origin = dp->dp_origin_snap;
+
+ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+ ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_flags = flags;
dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+ DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
dsphys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+
+ if (origin) {
+ dsphys->ds_prev_snap_obj = origin->ds_object;
+ dsphys->ds_prev_snap_txg =
+ origin->ds_phys->ds_creation_txg;
+ dsphys->ds_used_bytes =
+ origin->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes =
+ origin->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ origin->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_bp = origin->ds_phys->ds_bp;
+ dsphys->ds_flags |= origin->ds_phys->ds_flags;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ origin->ds_phys->ds_num_children++;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+ if (origin->ds_phys->ds_next_clones_obj == 0) {
+ origin->ds_phys->ds_next_clones_obj =
+ zap_create(mos,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY(0 == zap_add_int(mos,
+ origin->ds_phys->ds_next_clones_obj,
+ dsobj, tx));
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_origin_obj = origin->ds_object;
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
- VERIFY(0 ==
- dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
- (void) dmu_objset_create_impl(dp->dp_spa, ds,
- &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (dsobj);
}
uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
uint64_t dsobj, ddobj;
- objset_t *mos = dp->dp_meta_objset;
dsl_dir_t *dd;
- ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
- ASSERT(clone_parent == NULL ||
- clone_parent->ds_phys->ds_num_children > 0);
ASSERT(lastname[0] != '@');
- ASSERT(dmu_tx_is_syncing(tx));
- ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (clone_parent) {
- dsphys->ds_prev_snap_obj = clone_parent->ds_object;
- dsphys->ds_prev_snap_txg =
- clone_parent->ds_phys->ds_creation_txg;
- dsphys->ds_used_bytes =
- clone_parent->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes =
- clone_parent->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- clone_parent->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+ dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
- dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
- clone_parent->ds_phys->ds_num_children++;
+ dsl_deleg_set_create_perms(dd, tx, cr);
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
- }
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
dsl_dir_close(dd, FTAG);
return (dsobj);
@@ -653,21 +862,24 @@ dsl_snapshot_destroy_one(char *name, void *arg)
(void) strcat(name, "@");
(void) strcat(name, da->snapname);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
da->dstg, &ds);
cp = strchr(name, '@');
*cp = '\0';
- if (err == ENOENT)
- return (0);
- if (err) {
+ if (err == 0) {
+ dsl_dataset_make_exclusive(ds, da->dstg);
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+ dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, da->dstg, 0);
+ } else if (err == ENOENT) {
+ err = 0;
+ } else {
(void) strcpy(da->failed, name);
- return (err);
}
-
- dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
- return (0);
+ return (err);
}
/*
@@ -681,16 +893,8 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
struct destroyarg da;
dsl_sync_task_t *dst;
spa_t *spa;
- char *cp;
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
+ err = spa_open(fsname, &spa, FTAG);
if (err)
return (err);
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
@@ -706,17 +910,14 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
for (dst = list_head(&da.dstg->dstg_tasks); dst;
dst = list_next(&da.dstg->dstg_tasks, dst)) {
dsl_dataset_t *ds = dst->dst_arg1;
+ /*
+ * Return the file system name that triggered the error
+ */
if (dst->dst_err) {
dsl_dataset_name(ds, fsname);
- cp = strchr(fsname, '@');
- *cp = '\0';
+ *strchr(fsname, '@') = '\0';
}
- /*
- * If it was successful, destroy_sync would have
- * closed the ds
- */
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
+ dsl_dataset_disown(ds, da.dstg);
}
dsl_sync_task_group_destroy(da.dstg);
@@ -724,36 +925,33 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
return (err);
}
+/*
+ * ds must be opened as OWNER. On return (whether successful or not),
+ * ds will be closed and caller can no longer dereference it.
+ */
int
-dsl_dataset_destroy(const char *name)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
int err;
dsl_sync_task_group_t *dstg;
objset_t *os;
- dsl_dataset_t *ds;
dsl_dir_t *dd;
uint64_t obj;
- if (strchr(name, '@')) {
+ if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
+ dsl_dataset_make_exclusive(ds, tag);
+
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, FTAG, 0);
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
+ ds, tag, 0);
+ goto out;
}
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err)
- return (err);
- ds = os->os->os_dsl_dataset;
dd = ds->ds_dir;
/*
@@ -762,10 +960,12 @@ dsl_dataset_destroy(const char *name)
*/
err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
dsl_dataset_destroy_begin_sync, ds, NULL, 0);
- if (err) {
- dmu_objset_close(os);
- return (err);
- }
+ if (err)
+ goto out;
+
+ err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ if (err)
+ goto out;
/*
* remove the objects in open context, so that we won't
@@ -773,66 +973,73 @@ dsl_dataset_destroy(const char *name)
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
ds->ds_phys->ds_prev_snap_txg)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- /*
- * Perhaps there is not enough disk
- * space. Just deal with it from
- * dsl_dataset_destroy_sync().
- */
- dmu_tx_abort(tx);
- continue;
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- dmu_tx_commit(tx);
+ /*
+ * Ignore errors, if there is not enough disk space
+ * we will deal with it in dsl_dataset_destroy_sync().
+ */
+ (void) dmu_free_object(os, obj);
}
- /* Make sure it's not dirty before we finish destroying it. */
- txg_wait_synced(dd->dd_pool, 0);
dmu_objset_close(os);
if (err != ESRCH)
- return (err);
+ goto out;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
if (err)
- return (err);
+ goto out;
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
+ if (ds->ds_user_ptr) {
+ /*
+ * We need to sync out all in-flight IO before we try
+ * to evict (the dataset evict func is trying to clear
+ * the cached entries for this dataset in the ARC).
+ */
+ txg_wait_synced(dd->dd_pool, 0);
}
/*
* Blow away the dsl_dir + head dataset.
*/
+ dsl_dataset_make_exclusive(ds, tag);
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, FTAG, 0);
+ dsl_dataset_destroy_sync, ds, tag, 0);
dsl_sync_task_create(dstg, dsl_dir_destroy_check,
dsl_dir_destroy_sync, dd, FTAG, 0);
err = dsl_sync_task_group_wait(dstg);
dsl_sync_task_group_destroy(dstg);
- /* if it is successful, *destroy_sync will close the ds+dd */
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ /* if it is successful, dsl_dir_destroy_sync will close the dd */
+ if (err)
dsl_dir_close(dd, FTAG);
- }
+out:
+ dsl_dataset_disown(ds, tag);
return (err);
}
int
-dsl_dataset_rollback(dsl_dataset_t *ds)
+dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
{
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- return (dsl_sync_task_do(ds->ds_dir->dd_pool,
+ int err;
+
+ ASSERT(ds->ds_owner);
+
+ dsl_dataset_make_exclusive(ds, ds->ds_owner);
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, NULL, 0));
+ ds, &ost, 0);
+ /* drop exclusive access */
+ mutex_enter(&ds->ds_lock);
+ rw_exit(&ds->ds_rwlock);
+ cv_broadcast(&ds->ds_exclusive_cv);
+ mutex_exit(&ds->ds_lock);
+ return (err);
}
void *
@@ -904,14 +1111,56 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
}
}
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use. To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+static void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+ uint64_t mrs_used;
+ uint64_t dlused, dlcomp, dluncomp;
+
+ ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0)
+ mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+ else
+ mrs_used = 0;
+
+ VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
+ &dluncomp));
+
+ ASSERT3U(dlused, <=, mrs_used);
+ ds->ds_phys->ds_unique_bytes =
+ ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+
+ if (!DS_UNIQUE_IS_ACCURATE(ds) &&
+ spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+static uint64_t
+dsl_dataset_unique(dsl_dataset_t *ds)
+{
+ if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+
+ return (ds->ds_phys->ds_unique_bytes);
+}
+
struct killarg {
- uint64_t *usedp;
- uint64_t *compressedp;
- uint64_t *uncompressedp;
+ dsl_dataset_t *ds;
zio_t *zio;
dmu_tx_t *tx;
};
+/* ARGSUSED */
static int
kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
@@ -920,16 +1169,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
ASSERT3U(bc->bc_errno, ==, 0);
- /*
- * Since this callback is not called concurrently, no lock is
- * needed on the accounting values.
- */
- *ka->usedp += bp_get_dasize(spa, bp);
- *ka->compressedp += BP_GET_PSIZE(bp);
- *ka->uncompressedp += BP_GET_UCSIZE(bp);
- /* XXX check for EIO? */
- (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
- ARC_NOWAIT);
+ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+
return (0);
}
@@ -938,14 +1180,12 @@ static int
dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dmu_objset_type_t *ost = arg2;
/*
- * There must be a previous snapshot. I suppose we could roll
- * it back to being empty (and re-initialize the upper (ZPL)
- * layer). But for now there's no way to do this via the user
- * interface.
+ * We can only roll back to emptyness if it is a ZPL objset.
*/
- if (ds->ds_phys->ds_prev_snap_txg == 0)
+ if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
return (EINVAL);
/*
@@ -966,13 +1206,44 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dmu_objset_type_t *ost = arg2;
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ /*
+ * Before the roll back destroy the zil.
+ */
+ if (ds->ds_user_ptr != NULL) {
+ zil_rollback_destroy(
+ ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
+
+ /*
+ * We need to make sure that the objset_impl_t is reopened after
+ * we do the rollback, otherwise it will have the wrong
+ * objset_phys_t. Normally this would happen when this
+ * dataset-open is closed, thus causing the
+ * dataset to be immediately evicted. But when doing "zfs recv
+ * -F", we reopen the objset before that, so that there is no
+ * window where the dataset is closed and inconsistent.
+ */
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+
+ /* Transfer space that was freed since last snap back to the head. */
+ {
+ uint64_t used;
+
+ VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
+ ds->ds_origin_txg, UINT64_MAX, &used));
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_SNAP, DD_USED_HEAD, tx);
+ }
+
/* Zero out the deadlist. */
bplist_close(&ds->ds_deadlist);
bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
@@ -984,39 +1255,65 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
/* Free blkptrs that we gave birth to */
zio_t *zio;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
struct killarg ka;
zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
ZIO_FLAG_MUSTSUCCEED);
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
+ ka.ds = ds;
ka.zio = zio;
ka.tx = tx;
(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
ADVANCE_POST, kill_blkptr, &ka);
(void) zio_wait(zio);
-
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
}
- /* Change our contents to that of the prev snapshot */
- ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
- ds->ds_phys->ds_unique_bytes = 0;
+ ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) ||
+ ds->ds_phys->ds_unique_bytes == 0);
+
+ if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
+ /* Change our contents to that of the prev snapshot */
+
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
+ ds->ds_prev->ds_phys->ds_used_bytes);
+
+ ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+ ds->ds_phys->ds_used_bytes =
+ ds->ds_prev->ds_phys->ds_used_bytes;
+ ds->ds_phys->ds_compressed_bytes =
+ ds->ds_prev->ds_phys->ds_compressed_bytes;
+ ds->ds_phys->ds_uncompressed_bytes =
+ ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+ ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ }
+ } else {
+ objset_impl_t *osi;
+
+ ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
+ ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
+ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
+ ds->ds_phys->ds_flags = 0;
+ ds->ds_phys->ds_unique_bytes = 0;
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
+ &ds->ds_phys->ds_bp, *ost, tx);
+#ifdef _KERNEL
+ zfs_create_fs(&osi->os, kcred, NULL, tx);
+#endif
}
+
+ spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", ds->ds_object);
}
/* ARGSUSED */
@@ -1024,6 +1321,9 @@ static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t count;
+ int err;
/*
* Can't delete a head dataset if there are snapshots of it.
@@ -1034,26 +1334,44 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
return (EINVAL);
+ /*
+ * This is really a dsl_dir thing, but check it here so that
+ * we'll be less likely to leave this dataset inconsistent &
+ * nearly destroyed.
+ */
+ err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+ if (err)
+ return (err);
+ if (count != 0)
+ return (EEXIST);
+
return (0);
}
/* ARGSUSED */
static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
/* Mark it as inconsistent on-disk, in case we crash */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
}
/* ARGSUSED */
-static int
+int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ /* we have an owner hold, so noone else can destroy us */
+ ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
+
/* Can't delete a branch point. */
if (ds->ds_phys->ds_num_children > 1)
return (EEXIST);
@@ -1078,11 +1396,50 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
+struct refsarg {
+ kmutex_t lock;
+ boolean_t gone;
+ kcondvar_t cv;
+};
+
+/* ARGSUSED */
+static void
+dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
+{
+ struct refsarg *arg = argv;
+
+ mutex_enter(&arg->lock);
+ arg->gone = TRUE;
+ cv_signal(&arg->cv);
+ mutex_exit(&arg->lock);
+}
+
static void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
+{
+ struct refsarg arg;
+
+ mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
+ arg.gone = FALSE;
+ (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
+ dsl_dataset_refs_gone);
+ dmu_buf_rele(ds->ds_dbuf, tag);
+ mutex_enter(&arg.lock);
+ while (!arg.gone)
+ cv_wait(&arg.cv, &arg.lock);
+ ASSERT(arg.gone);
+ mutex_exit(&arg.lock);
+ ds->ds_dbuf = NULL;
+ ds->ds_phys = NULL;
+ mutex_destroy(&arg.lock);
+ cv_destroy(&arg.cv);
+}
+
+void
+dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
zio_t *zio;
int err;
int after_branch_point = FALSE;
@@ -1091,29 +1448,53 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
dsl_dataset_t *ds_prev = NULL;
uint64_t obj;
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ ASSERT(ds->ds_owner);
ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ /* signal any waiters that this dataset is going away */
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = dsl_reaper;
+ cv_broadcast(&ds->ds_exclusive_cv);
+ mutex_exit(&ds->ds_lock);
+
+ /* Remove our reservation */
+ if (ds->ds_reserved != 0) {
+ uint64_t val = 0;
+ dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+ ASSERT3U(ds->ds_reserved, ==, 0);
+ }
+
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ dsl_pool_ds_destroyed(ds, tx);
+
obj = ds->ds_object;
if (ds->ds_phys->ds_prev_snap_obj != 0) {
if (ds->ds_prev) {
ds_prev = ds->ds_prev;
} else {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_prev));
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
}
after_branch_point =
(ds_prev->ds_phys->ds_next_snap_obj != obj);
dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
if (after_branch_point &&
+ ds_prev->ds_phys->ds_next_clones_obj != 0) {
+ VERIFY(0 == zap_remove_int(mos,
+ ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(0 == zap_add_int(mos,
+ ds_prev->ds_phys->ds_next_clones_obj,
+ ds->ds_phys->ds_next_snap_obj, tx));
+ }
+ }
+ if (after_branch_point &&
ds->ds_phys->ds_next_snap_obj == 0) {
/* This clone is toast. */
ASSERT(ds_prev->ds_phys->ds_num_children > 1);
@@ -1130,14 +1511,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
blkptr_t bp;
dsl_dataset_t *ds_next;
uint64_t itor = 0;
+ uint64_t old_unique;
+ int64_t used = 0, compressed = 0, uncompressed = 0;
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_next));
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+ old_unique = dsl_dataset_unique(ds_next);
+
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
ds_next->ds_phys->ds_prev_snap_obj =
ds->ds_phys->ds_prev_snap_obj;
@@ -1154,8 +1536,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
*
* XXX we're doing this long task with the config lock held
*/
- while (bplist_iterate(&ds_next->ds_deadlist, &itor,
- &bp) == 0) {
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
&bp, tx));
@@ -1170,16 +1551,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
compressed += BP_GET_PSIZE(&bp);
uncompressed += BP_GET_UCSIZE(&bp);
/* XXX check return value? */
- (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ (void) dsl_free(zio, dp, tx->tx_txg,
&bp, NULL, NULL, ARC_NOWAIT);
}
}
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -compressed, -uncompressed, tx);
+
/* free next's deadlist */
bplist_close(&ds_next->ds_deadlist);
bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
/* set next's deadlist to our deadlist */
+ bplist_close(&ds->ds_deadlist);
ds_next->ds_phys->ds_deadlist_obj =
ds->ds_phys->ds_deadlist_obj;
VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
@@ -1200,51 +1588,50 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
* config lock held
*/
dsl_dataset_t *ds_after_next;
+ uint64_t space;
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_after_next));
- itor = 0;
- while (bplist_iterate(&ds_after_next->ds_deadlist,
- &itor, &bp) == 0) {
- if (bp.blk_birth >
- ds->ds_phys->ds_prev_snap_txg &&
- bp.blk_birth <=
- ds->ds_phys->ds_creation_txg) {
- ds_next->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- }
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds_next->ds_phys->ds_next_snap_obj,
+ FTAG, &ds_after_next));
+
+ VERIFY(0 ==
+ bplist_space_birthrange(&ds_after_next->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg,
+ ds->ds_phys->ds_creation_txg, &space));
+ ds_next->ds_phys->ds_unique_bytes += space;
- dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ dsl_dataset_rele(ds_after_next, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
} else {
- /*
- * It would be nice to update the head dataset's
- * unique. To do so we would have to traverse
- * it for blocks born after ds_prev, which is
- * pretty expensive just to maintain something
- * for debugging purposes.
- */
ASSERT3P(ds_next->ds_prev, ==, ds);
- dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
- ds_next);
+ dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
+ ds_next->ds_prev = NULL;
if (ds_prev) {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds_next, &ds_next->ds_prev));
- } else {
- ds_next->ds_prev = NULL;
+ VERIFY(0 == dsl_dataset_get_ref(dp,
+ ds->ds_phys->ds_prev_snap_obj,
+ ds_next, &ds_next->ds_prev));
}
- }
- dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
- /*
- * NB: unique_bytes is not accurate for head objsets
- * because we don't update it when we delete the most
- * recent snapshot -- see above comment.
- */
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ dsl_dataset_recalc_head_uniq(ds_next);
+
+ /*
+ * Reduce the amount of our unconsmed refreservation
+ * being charged to our parent by the amount of
+ * new unique data we have gained.
+ */
+ if (old_unique < ds_next->ds_reserved) {
+ int64_t mrsdelta;
+ uint64_t new_unique =
+ ds_next->ds_phys->ds_unique_bytes;
+
+ ASSERT(old_unique <= new_unique);
+ mrsdelta = MIN(new_unique - old_unique,
+ ds_next->ds_reserved - old_unique);
+ dsl_dir_diduse_space(ds->ds_dir,
+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+ }
+ }
+ dsl_dataset_rele(ds_next, FTAG);
} else {
/*
* There's no next snapshot, so this is a head dataset.
@@ -1263,76 +1650,106 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
- * XXX we're doing this long task with the config lock held
+ * NB: this should be very quick, because we already
+ * freed all the objects in open context.
*/
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
+ ka.ds = ds;
ka.zio = zio;
ka.tx = tx;
err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
ADVANCE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
+ ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE ||
+ ds->ds_phys->ds_unique_bytes == 0);
}
err = zio_wait(zio);
ASSERT3U(err, ==, 0);
- dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
-
- if (ds->ds_phys->ds_snapnames_zapobj) {
- err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
- ASSERT(err == 0);
- }
-
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
- /* Erase the link in the dataset */
+ /* Erase the link in the dir */
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
- /*
- * dsl_dir_sync_destroy() called us, they'll destroy
- * the dataset.
- */
+ ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
} else {
/* remove from snapshot namespace */
dsl_dataset_t *ds_head;
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_head));
+ ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
{
uint64_t val;
- err = zap_lookup(mos,
- ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &val);
+
+ err = dsl_dataset_snap_lookup(ds_head,
+ ds->ds_snapname, &val);
ASSERT3U(err, ==, 0);
ASSERT3U(val, ==, obj);
}
#endif
- err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
+ err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
ASSERT(err == 0);
- dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ dsl_dataset_rele(ds_head, FTAG);
}
if (ds_prev && ds->ds_prev != ds_prev)
- dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
-
- spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
+ dsl_dataset_rele(ds_prev, FTAG);
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+ spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
+
+ if (ds->ds_phys->ds_next_clones_obj != 0) {
+ uint64_t count;
+ ASSERT(0 == zap_count(mos,
+ ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+ VERIFY(0 == dmu_object_free(mos,
+ ds->ds_phys->ds_next_clones_obj, tx));
+ }
+ if (ds->ds_phys->ds_props_obj != 0)
+ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+ dsl_dir_close(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dsl_dataset_drain_refs(ds, tag);
VERIFY(0 == dmu_object_free(mos, obj, tx));
+}
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t asize;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * If there's an fs-only reservation, any blocks that might become
+ * owned by the snapshot dataset must be accommodated by space
+ * outside of the reservation.
+ */
+ asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+ return (ENOSPC);
+
+ /*
+ * Propogate any reserved space for this snapshot to other
+ * snapshot checks in this sync group.
+ */
+ if (asize > 0)
+ dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+ return (0);
}
/* ARGSUSED */
int
dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = arg1;
const char *snapname = arg2;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
int err;
uint64_t value;
@@ -1346,8 +1763,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* Check for conflicting name snapshot name.
*/
- err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
- snapname, 8, 1, &value);
+ err = dsl_dataset_snap_lookup(ds, snapname, &value);
if (err == 0)
return (EEXIST);
if (err != ENOENT)
@@ -1360,34 +1776,44 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
return (ENAMETOOLONG);
+ err = dsl_dataset_snapshot_reserve_space(ds, tx);
+ if (err)
+ return (err);
+
ds->ds_trysnap_txg = tx->tx_txg;
return (0);
}
void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dataset_t *ds = arg1;
const char *snapname = arg2;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
- uint64_t dsobj;
+ uint64_t dsobj, crtxg;
objset_t *mos = dp->dp_meta_objset;
int err;
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ /*
+ * The origin's ds_creation_txg has to be < TXG_INITIAL
+ */
+ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+ crtxg = 1;
+ else
+ crtxg = tx->tx_txg;
+
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = ds->ds_dir->dd_object;
dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
@@ -1395,7 +1821,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dsphys->ds_next_snap_obj = ds->ds_object;
dsphys->ds_num_children = 1;
dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_creation_txg = crtxg;
dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
@@ -1406,6 +1832,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
+ uint64_t next_clones_obj =
+ ds->ds_prev->ds_phys->ds_next_clones_obj;
ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
ds->ds_object ||
ds->ds_prev->ds_phys->ds_num_children > 1);
@@ -1414,15 +1842,33 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
ds->ds_prev->ds_phys->ds_creation_txg);
ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+ } else if (next_clones_obj != 0) {
+ VERIFY3U(0, ==, zap_remove_int(mos,
+ next_clones_obj, dsphys->ds_next_snap_obj, tx));
+ VERIFY3U(0, ==, zap_add_int(mos,
+ next_clones_obj, dsobj, tx));
}
}
+ /*
+ * If we have a reference-reservation on this dataset, we will
+ * need to increase the amount of refreservation being charged
+ * since our unique space is going to zero.
+ */
+ if (ds->ds_reserved) {
+ int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+ add, 0, 0, tx);
+ }
+
bplist_close(&ds->ds_deadlist);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
ds->ds_phys->ds_prev_snap_obj = dsobj;
- ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
+ ds->ds_phys->ds_prev_snap_txg = crtxg;
ds->ds_phys->ds_unique_bytes = 0;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
ds->ds_phys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
@@ -1434,10 +1880,14 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(err == 0);
if (ds->ds_prev)
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, snapname,
- DS_MODE_NONE, ds, &ds->ds_prev));
+ dsl_dataset_drop_ref(ds->ds_prev, ds);
+ VERIFY(0 == dsl_dataset_get_ref(dp,
+ ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+
+ dsl_pool_ds_snapshotted(ds, tx);
+
+ spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+ "dataset = %llu", dsobj);
}
void
@@ -1447,22 +1897,38 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ASSERT(ds->ds_user_ptr != NULL);
ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+ /*
+ * in case we had to change ds_fsid_guid when we opened it,
+ * sync it out now.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+
dsl_dir_dirty(ds->ds_dir, tx);
dmu_objset_sync(ds->ds_user_ptr, zio, tx);
- /* Unneeded? bplist_close(&ds->ds_deadlist); */
}
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
+ uint64_t refd, avail, uobjs, aobjs;
+
dsl_dir_stats(ds->ds_dir, nv);
+ dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
ds->ds_phys->ds_creation_time);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
ds->ds_phys->ds_creation_txg);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
- ds->ds_phys->ds_used_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+ ds->ds_quota);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+ ds->ds_reserved);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+ ds->ds_phys->ds_guid);
if (ds->ds_phys->ds_next_snap_obj) {
/*
@@ -1483,29 +1949,29 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
{
stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
+ stat->dds_guid = ds->ds_phys->ds_guid;
if (ds->ds_phys->ds_next_snap_obj) {
stat->dds_is_snapshot = B_TRUE;
stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
}
/* clone origin is really a dsl_dir thing... */
- if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_dir_is_clone(ds->ds_dir)) {
dsl_dataset_t *ods;
- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ods));
- dsl_dataset_name(ods, stat->dds_clone_of);
- dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
+ dsl_dataset_name(ods, stat->dds_origin);
+ dsl_dataset_drop_ref(ods, FTAG);
}
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
}
uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
- return (ds->ds_phys->ds_fsid_guid);
+ return (ds->ds_fsid_guid);
}
void
@@ -1515,10 +1981,37 @@ dsl_dataset_space(dsl_dataset_t *ds,
{
*refdbytesp = ds->ds_phys->ds_used_bytes;
*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
+ *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (*refdbytesp < ds->ds_quota)
+ *availbytesp = MIN(*availbytesp,
+ ds->ds_quota - *refdbytesp);
+ else
+ *availbytesp = 0;
+ }
*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
+boolean_t
+dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+ if (ds->ds_prev == NULL)
+ return (B_FALSE);
+ if (ds->ds_phys->ds_bp.blk_birth >
+ ds->ds_prev->ds_phys->ds_creation_txg)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
/* ARGSUSED */
static int
dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1526,20 +2019,18 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dataset_t *ds = arg1;
char *newsnapname = arg2;
dsl_dir_t *dd = ds->ds_dir;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
dsl_dataset_t *hds;
uint64_t val;
int err;
- err = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
+ err = dsl_dataset_hold_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
if (err)
return (err);
/* new name better not be in use */
- err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
- newsnapname, 8, 1, &val);
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+ err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
+ dsl_dataset_rele(hds, FTAG);
if (err == 0)
err = EEXIST;
@@ -1554,10 +2045,11 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
+ cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- char *newsnapname = arg2;
+ const char *newsnapname = arg2;
dsl_dir_t *dd = ds->ds_dir;
objset_t *mos = dd->dd_pool->dp_meta_objset;
dsl_dataset_t *hds;
@@ -1565,12 +2057,11 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
+ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
VERIFY(0 == dsl_dataset_get_snapname(ds));
- err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
+ err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
ASSERT3U(err, ==, 0);
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, newsnapname);
@@ -1579,10 +2070,12 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ds->ds_snapname, 8, 1, &ds->ds_object, tx);
ASSERT3U(err, ==, 0);
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+ spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
+ dsl_dataset_rele(hds, FTAG);
}
-struct renamearg {
+struct renamesnaparg {
dsl_sync_task_group_t *dstg;
char failed[MAXPATHLEN];
char *oldsnap;
@@ -1592,7 +2085,7 @@ struct renamearg {
static int
dsl_snapshot_rename_one(char *name, void *arg)
{
- struct renamearg *ra = arg;
+ struct renamesnaparg *ra = arg;
dsl_dataset_t *ds = NULL;
char *cp;
int err;
@@ -1600,25 +2093,33 @@ dsl_snapshot_rename_one(char *name, void *arg)
cp = name + strlen(name);
*cp = '@';
(void) strcpy(cp + 1, ra->oldsnap);
- err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
- ra->dstg, &ds);
+
+ /*
+ * For recursive snapshot renames the parent won't be changing
+ * so we just pass name for both the to/from argument.
+ */
+ err = zfs_secpolicy_rename_perms(name, name, CRED());
if (err == ENOENT) {
- *cp = '\0';
return (0);
- }
- if (err) {
+ } else if (err) {
(void) strcpy(ra->failed, name);
- *cp = '\0';
- dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
return (err);
}
#ifdef _KERNEL
- /* for all filesystems undergoing rename, we'll need to unmount it */
+ /*
+ * For all filesystems undergoing rename, we'll need to unmount it.
+ */
(void) zfs_unmount_snap(name, NULL);
#endif
-
+ err = dsl_dataset_hold(name, ra->dstg, &ds);
*cp = '\0';
+ if (err == ENOENT) {
+ return (0);
+ } else if (err) {
+ (void) strcpy(ra->failed, name);
+ return (err);
+ }
dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
@@ -1630,7 +2131,7 @@ static int
dsl_recursive_rename(char *oldname, const char *newname)
{
int err;
- struct renamearg *ra;
+ struct renamesnaparg *ra;
dsl_sync_task_t *dst;
spa_t *spa;
char *cp, *fsname = spa_strdup(oldname);
@@ -1640,19 +2141,12 @@ dsl_recursive_rename(char *oldname, const char *newname)
cp = strchr(fsname, '@');
*cp = '\0';
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
+ err = spa_open(fsname, &spa, FTAG);
if (err) {
kmem_free(fsname, len + 1);
return (err);
}
- ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
+ ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
ra->oldsnap = strchr(oldname, '@') + 1;
@@ -1675,21 +2169,32 @@ dsl_recursive_rename(char *oldname, const char *newname)
(void) strcat(ra->failed, "@");
(void) strcat(ra->failed, ra->newsnap);
}
- dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
+ dsl_dataset_rele(ds, ra->dstg);
}
- (void) strcpy(oldname, ra->failed);
+ if (err)
+ (void) strcpy(oldname, ra->failed);
dsl_sync_task_group_destroy(ra->dstg);
- kmem_free(ra, sizeof (struct renamearg));
+ kmem_free(ra, sizeof (struct renamesnaparg));
spa_close(spa, FTAG);
return (err);
}
+static int
+dsl_valid_rename(char *oldname, void *arg)
+{
+ int delta = *(int *)arg;
+
+ if (strlen(oldname) + delta >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ return (0);
+}
+
#pragma weak dmu_objset_rename = dsl_dataset_rename
int
-dsl_dataset_rename(char *oldname, const char *newname,
- boolean_t recursive)
+dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
{
dsl_dir_t *dd;
dsl_dataset_t *ds;
@@ -1700,7 +2205,15 @@ dsl_dataset_rename(char *oldname, const char *newname,
if (err)
return (err);
if (tail == NULL) {
- err = dsl_dir_rename(dd, newname);
+ int delta = strlen(newname) - strlen(oldname);
+
+ /* if we're growing, validate child name lengths */
+ if (delta > 0)
+ err = dmu_objset_find(oldname, dsl_valid_rename,
+ &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+ if (!err)
+ err = dsl_dir_rename(dd, newname);
dsl_dir_close(dd, FTAG);
return (err);
}
@@ -1723,8 +2236,7 @@ dsl_dataset_rename(char *oldname, const char *newname,
if (recursive) {
err = dsl_recursive_rename(oldname, newname);
} else {
- err = dsl_dataset_open(oldname,
- DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+ err = dsl_dataset_hold(oldname, FTAG, &ds);
if (err)
return (err);
@@ -1732,278 +2244,640 @@ dsl_dataset_rename(char *oldname, const char *newname,
dsl_dataset_snapshot_rename_check,
dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
- dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+ dsl_dataset_rele(ds, FTAG);
}
return (err);
}
+struct promotenode {
+ list_node_t link;
+ dsl_dataset_t *ds;
+};
+
struct promotearg {
- uint64_t used, comp, uncomp, unique;
- uint64_t newnext_obj, snapnames_obj;
+ list_t shared_snaps, origin_snaps, clone_snaps;
+ dsl_dataset_t *origin_origin, *origin_head;
+ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
};
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+
+/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *hds = arg1;
struct promotearg *pa = arg2;
- dsl_dir_t *dd = hds->ds_dir;
- dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds = NULL;
- dsl_dataset_t *pivot_ds = NULL;
- dsl_dataset_t *newnext_ds = NULL;
+ struct promotenode *snap = list_head(&pa->shared_snaps);
+ dsl_dataset_t *origin_ds = snap->ds;
int err;
- char *name = NULL;
- uint64_t itor = 0;
- blkptr_t bp;
-
- bzero(pa, sizeof (*pa));
- /* Check that it is a clone */
- if (dd->dd_phys->dd_clone_parent_obj == 0)
+ /* Check that it is a real clone */
+ if (!dsl_dir_is_clone(hds->ds_dir))
return (EINVAL);
/* Since this is so expensive, don't do the preliminary check */
if (!dmu_tx_is_syncing(tx))
return (0);
- if (err = dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
- goto out;
- pdd = pivot_ds->ds_dir;
-
- {
- dsl_dataset_t *phds;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- pdd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_NONE, FTAG, &phds))
- goto out;
- pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
- }
-
- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
- err = EXDEV;
- goto out;
- }
-
- /* find pivot point's new next ds */
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
- NULL, DS_MODE_NONE, FTAG, &newnext_ds));
- while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
- dsl_dataset_t *prev;
-
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- newnext_ds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_NONE, FTAG, &prev))
- goto out;
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- newnext_ds = prev;
- }
- pa->newnext_obj = newnext_ds->ds_object;
+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
+ return (EXDEV);
- /* compute pivot point's new unique space */
- while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
- &itor, &bp)) == 0) {
- if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
- pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
- }
- if (err != ENOENT)
- goto out;
+ /* compute origin's new unique space */
+ snap = list_tail(&pa->clone_snaps);
+ ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+ err = bplist_space_birthrange(&snap->ds->ds_deadlist,
+ origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
+ if (err)
+ return (err);
- /* Walk the snapshots that we are moving */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
+ /*
+ * Walk the snapshots that we are moving
+ *
+ * Compute space to transfer. Consider the incremental changes
+ * to used for each snapshot:
+ * (my used) = (prev's used) + (blocks born) - (blocks killed)
+ * So each snapshot gave birth to:
+ * (blocks born) = (my used) - (prev's used) + (blocks killed)
+ * So a sequence would look like:
+ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+ * Which simplifies to:
+ * uN + kN + kN-1 + ... + k1 + k0
+ * Note however, if we stop before we reach the ORIGIN we get:
+ * uN + kN + kN-1 + ... + kM - uM-1
+ */
+ pa->used = origin_ds->ds_phys->ds_used_bytes;
+ pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
+ pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+ for (snap = list_head(&pa->shared_snaps); snap;
+ snap = list_next(&pa->shared_snaps, snap)) {
uint64_t val, dlused, dlcomp, dluncomp;
- dsl_dataset_t *prev;
+ dsl_dataset_t *ds = snap->ds;
/* Check that the snapshot name does not conflict */
- dsl_dataset_name(ds, name);
- err = zap_lookup(dd->dd_pool->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &val);
- if (err != ENOENT) {
- if (err == 0)
- err = EEXIST;
- goto out;
- }
-
- /*
- * compute space to transfer. Each snapshot gave birth to:
- * (my used) - (prev's used) + (deadlist's used)
- */
- pa->used += ds->ds_phys->ds_used_bytes;
- pa->comp += ds->ds_phys->ds_compressed_bytes;
- pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
- /* If we reach the first snapshot, we're done. */
+ /* The very first snapshot does not have a deadlist */
if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
+ continue;
if (err = bplist_space(&ds->ds_deadlist,
&dlused, &dlcomp, &dluncomp))
- goto out;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev))
- goto out;
- pa->used += dlused - prev->ds_phys->ds_used_bytes;
- pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
- pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
+ return (err);
+ pa->used += dlused;
+ pa->comp += dlcomp;
+ pa->uncomp += dluncomp;
+ }
- /*
- * We could be a clone of a clone. If we reach our
- * parent's branch point, we're done.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
+ /*
+ * If we are a clone of a clone then we never reached ORIGIN,
+ * so we need to subtract out the clone origin's used space.
+ */
+ if (pa->origin_origin) {
+ pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
+ pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
+ pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
}
/* Check that there is enough space here */
- err = dsl_dir_transfer_possible(pdd, dd, pa->used);
+ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
+ pa->used);
+ if (err)
+ return (err);
-out:
- if (ds && ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- if (pivot_ds)
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- if (newnext_ds)
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- if (name)
- kmem_free(name, MAXPATHLEN);
- return (err);
+ /*
+ * Compute the amounts of space that will be used by snapshots
+ * after the promotion (for both origin and clone). For each,
+ * it is the amount of space that will be on all of their
+ * deadlists (that was not born before their new origin).
+ */
+ if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ uint64_t space;
+
+ /*
+ * Note, typically this will not be a clone of a clone,
+ * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> bplist_space_birthrange()
+ * calls will be fast because they do not have to
+ * iterate over all bps.
+ */
+ snap = list_head(&pa->origin_snaps);
+ err = snaplist_space(&pa->shared_snaps,
+ snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+ if (err)
+ return (err);
+
+ err = snaplist_space(&pa->clone_snaps,
+ snap->ds->ds_origin_txg, &space);
+ if (err)
+ return (err);
+ pa->cloneusedsnap += space;
+ }
+ if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ err = snaplist_space(&pa->origin_snaps,
+ origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
+ if (err)
+ return (err);
+ }
+
+ return (0);
}
static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *hds = arg1;
struct promotearg *pa = arg2;
+ struct promotenode *snap = list_head(&pa->shared_snaps);
+ dsl_dataset_t *origin_ds = snap->ds;
+ dsl_dataset_t *origin_head;
dsl_dir_t *dd = hds->ds_dir;
dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds, *pivot_ds;
- char *name;
+ dsl_dir_t *odd = NULL;
+ uint64_t oldnext_obj;
+ int64_t delta;
- ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
- VERIFY(0 == dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
+ snap = list_head(&pa->origin_snaps);
+ origin_head = snap->ds;
+
/*
- * We need to explicitly open pdd, since pivot_ds's pdd will be
+ * We need to explicitly open odd, since origin_ds's dd will be
* changing.
*/
- VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
- NULL, FTAG, &pdd));
+ VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+ NULL, FTAG, &odd));
+
+ /* change origin's next snap */
+ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+ oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
+ snap = list_tail(&pa->clone_snaps);
+ ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+ origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+
+ /* change the origin's next clone */
+ if (origin_ds->ds_phys->ds_next_clones_obj) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ origin_ds->ds_phys->ds_next_clones_obj,
+ origin_ds->ds_phys->ds_next_snap_obj, tx));
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ origin_ds->ds_phys->ds_next_clones_obj,
+ oldnext_obj, tx));
+ }
- /* move snapshots to this dir */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
- dsl_dataset_t *prev;
+ /* change origin */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
+ dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+ hds->ds_origin_txg = origin_head->ds_origin_txg;
+ dmu_buf_will_dirty(odd->dd_dbuf, tx);
+ odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+ origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+ /* move snapshots to this dir */
+ for (snap = list_head(&pa->shared_snaps); snap;
+ snap = list_next(&pa->shared_snaps, snap)) {
+ dsl_dataset_t *ds = snap->ds;
+
+ /* unregister props as dsl_dir is changing */
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
/* move snap name entry */
- dsl_dataset_name(ds, name);
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- pa->snapnames_obj, ds->ds_snapname, tx));
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ VERIFY(0 == dsl_dataset_snap_remove(origin_head,
+ ds->ds_snapname, tx));
VERIFY(0 == zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
-
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
+ ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
ds->ds_phys->ds_dir_obj = dd->dd_object;
- ASSERT3P(ds->ds_dir, ==, pdd);
+ ASSERT3P(ds->ds_dir, ==, odd);
dsl_dir_close(ds->ds_dir, ds);
VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
ASSERT3U(dsl_prop_numcb(ds), ==, 0);
+ }
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
+ /*
+ * Change space accounting.
+ * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
+ * both be valid, or both be 0 (resulting in delta == 0). This
+ * is true for each of {clone,origin} independently.
+ */
+
+ delta = pa->cloneusedsnap -
+ dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, >=, 0);
+ ASSERT3U(pa->used, >=, delta);
+ dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ pa->used - delta, pa->comp, pa->uncomp, tx);
+
+ delta = pa->originusedsnap -
+ odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, <=, 0);
+ ASSERT3U(pa->used, >=, -delta);
+ dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(odd, DD_USED_HEAD,
+ -pa->used - delta, -pa->comp, -pa->uncomp, tx);
+
+ origin_ds->ds_phys->ds_unique_bytes = pa->unique;
+
+ /* log history record */
+ spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+ cr, "dataset = %llu", hds->ds_object);
+
+ dsl_dir_close(odd, FTAG);
+}
+
+static char *snaplist_tag = "snaplist";
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive). The list will be in reverse
+ * order (last_obj will be the list_head()). If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
+static int
+snaplist_make(dsl_pool_t *dp, boolean_t own,
+ uint64_t first_obj, uint64_t last_obj, list_t *l)
+{
+ uint64_t obj = last_obj;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
+
+ list_create(l, sizeof (struct promotenode),
+ offsetof(struct promotenode, link));
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev));
+ while (obj != first_obj) {
+ dsl_dataset_t *ds;
+ struct promotenode *snap;
+ int err;
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
+ if (own) {
+ err = dsl_dataset_own_obj(dp, obj,
+ 0, snaplist_tag, &ds);
+ if (err == 0)
+ dsl_dataset_make_exclusive(ds, snaplist_tag);
+ } else {
+ err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
+ }
+ if (err == ENOENT) {
+ /* lost race with snapshot destroy */
+ struct promotenode *last = list_tail(l);
+ ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
+ obj = last->ds->ds_phys->ds_prev_snap_obj;
+ continue;
+ } else if (err) {
+ return (err);
}
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
+
+ if (first_obj == 0)
+ first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+
+ snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
+ snap->ds = ds;
+ list_insert_tail(l, snap);
+ obj = ds->ds_phys->ds_prev_snap_obj;
}
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- /* change pivot point's next snap */
- dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
- pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
+ return (0);
+}
- /* change clone_parent-age */
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
- dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
- dmu_buf_will_dirty(pdd->dd_dbuf, tx);
- pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
+{
+ struct promotenode *snap;
- /* change space accounting */
- dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
- dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
- pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
+ *spacep = 0;
+ for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+ uint64_t used;
+ int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used);
+ if (err)
+ return (err);
+ *spacep += used;
+ }
+ return (0);
+}
- dsl_dir_close(pdd, FTAG);
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(name, MAXPATHLEN);
+static void
+snaplist_destroy(list_t *l, boolean_t own)
+{
+ struct promotenode *snap;
+
+ if (!list_link_active(&l->list_head))
+ return;
+
+ while ((snap = list_tail(l)) != NULL) {
+ list_remove(l, snap);
+ if (own)
+ dsl_dataset_disown(snap->ds, snaplist_tag);
+ else
+ dsl_dataset_rele(snap->ds, snaplist_tag);
+ kmem_free(snap, sizeof (struct promotenode));
+ }
+ list_destroy(l);
}
+/*
+ * Promote a clone. Nomenclature note:
+ * "clone" or "cds": the original clone which is being promoted
+ * "origin" or "ods": the snapshot which is originally clone's origin
+ * "origin head" or "ohds": the dataset which is the head
+ * (filesystem/volume) for the origin
+ * "origin origin": the origin of the origin's filesystem (typically
+ * NULL, indicating that the clone is not a clone of a clone).
+ */
int
dsl_dataset_promote(const char *name)
{
dsl_dataset_t *ds;
- int err;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
dmu_object_info_t doi;
- struct promotearg pa;
+ struct promotearg pa = { 0 };
+ struct promotenode *snap;
+ int err;
- err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
+ err = dsl_dataset_hold(name, FTAG, &ds);
if (err)
return (err);
+ dd = ds->ds_dir;
+ dp = dd->dd_pool;
- err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
+ err = dmu_object_info(dp->dp_meta_objset,
ds->ds_phys->ds_snapnames_zapobj, &doi);
if (err) {
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
+ if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EINVAL);
+ }
+
+ /*
+ * We are going to inherit all the snapshots taken before our
+ * origin (i.e., our new origin will be our parent's origin).
+ * Take ownership of them so that we can rename them into our
+ * namespace.
+ */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+ err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
+ &pa.shared_snaps);
+ if (err != 0)
+ goto out;
+
+ err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
+ if (err != 0)
+ goto out;
+
+ snap = list_head(&pa.shared_snaps);
+ ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
+ err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
+ snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
+ if (err != 0)
+ goto out;
+
+ if (dsl_dir_is_clone(snap->ds->ds_dir)) {
+ err = dsl_dataset_own_obj(dp,
+ snap->ds->ds_dir->dd_phys->dd_origin_obj,
+ 0, FTAG, &pa.origin_origin);
+ if (err != 0)
+ goto out;
+ }
+
+out:
+ rw_exit(&dp->dp_config_rwlock);
+
/*
* Add in 128x the snapnames zapobj size, since we will be moving
* a bunch of snapnames to the promoted ds, and dirtying their
* bonus buffers.
*/
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_promote_check,
- dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ if (err == 0) {
+ err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, ds, &pa,
+ 2 + 2 * doi.doi_physical_blks);
+ }
+
+ snaplist_destroy(&pa.shared_snaps, B_TRUE);
+ snaplist_destroy(&pa.clone_snaps, B_FALSE);
+ snaplist_destroy(&pa.origin_snaps, B_FALSE);
+ if (pa.origin_origin)
+ dsl_dataset_disown(pa.origin_origin, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
+struct cloneswaparg {
+ dsl_dataset_t *cds; /* clone dataset */
+ dsl_dataset_t *ohds; /* origin's head dataset */
+ boolean_t force;
+ int64_t unused_refres_delta; /* change in unconsumed refreservation */
+};
+
+/* ARGSUSED */
+static int
+dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ struct cloneswaparg *csa = arg1;
+
+ /* they should both be heads */
+ if (dsl_dataset_is_snapshot(csa->cds) ||
+ dsl_dataset_is_snapshot(csa->ohds))
+ return (EINVAL);
+
+ /* the branch point should be just before them */
+ if (csa->cds->ds_prev != csa->ohds->ds_prev)
+ return (EINVAL);
+
+ /* cds should be the clone */
+ if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
+ csa->ohds->ds_object)
+ return (EINVAL);
+
+ /* the clone should be a child of the origin */
+ if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
+ return (EINVAL);
+
+ /* ohds shouldn't be modified unless 'force' */
+ if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
+ return (ETXTBSY);
+
+ /* adjust amount of any unconsumed refreservation */
+ csa->unused_refres_delta =
+ (int64_t)MIN(csa->ohds->ds_reserved,
+ csa->ohds->ds_phys->ds_unique_bytes) -
+ (int64_t)MIN(csa->ohds->ds_reserved,
+ csa->cds->ds_phys->ds_unique_bytes);
+
+ if (csa->unused_refres_delta > 0 &&
+ csa->unused_refres_delta >
+ dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
+ return (ENOSPC);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ struct cloneswaparg *csa = arg1;
+ dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
+
+ ASSERT(csa->cds->ds_reserved == 0);
+ ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);
+
+ dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
+ dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
+ dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
+
+ if (csa->cds->ds_user_ptr != NULL) {
+ csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
+ csa->cds->ds_user_ptr = NULL;
+ }
+
+ if (csa->ohds->ds_user_ptr != NULL) {
+ csa->ohds->ds_user_evict_func(csa->ohds,
+ csa->ohds->ds_user_ptr);
+ csa->ohds->ds_user_ptr = NULL;
+ }
+
+ /* reset origin's unique bytes */
+ VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
+ csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &csa->cds->ds_prev->ds_phys->ds_unique_bytes));
+
+ /* swap blkptrs */
+ {
+ blkptr_t tmp;
+ tmp = csa->ohds->ds_phys->ds_bp;
+ csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
+ csa->cds->ds_phys->ds_bp = tmp;
+ }
+
+ /* set dd_*_bytes */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ ASSERT3U(csa->cds->ds_dir->dd_phys->
+ dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+ VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
+ &cdl_comp, &cdl_uncomp));
+ VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
+ &odl_comp, &odl_uncomp));
+
+ dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
+ (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+ dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
+ (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
+ duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
+ cdl_uncomp -
+ (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+
+ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
+ dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
+ -dused, -dcomp, -duncomp, tx);
+
+ /*
+ * The difference in the space used by snapshots is the
+ * difference in snapshot space due to the head's
+ * deadlist (since that's the only thing that's
+ * changing that affects the snapused).
+ */
+ VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
+ csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
+ VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
+ csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+ dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
+ csa->cds->ds_phys->ds_used_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
+ csa->cds->ds_phys->ds_compressed_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
+ csa->cds->ds_phys->ds_uncompressed_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
+ csa->cds->ds_phys->ds_unique_bytes);
+
+ /* apply any parent delta for change in unconsumed refreservation */
+ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
+ csa->unused_refres_delta, 0, 0, tx);
+
+ /* swap deadlists */
+ bplist_close(&csa->cds->ds_deadlist);
+ bplist_close(&csa->ohds->ds_deadlist);
+ SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
+ csa->cds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+ csa->cds->ds_phys->ds_deadlist_obj));
+ VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+ csa->ohds->ds_phys->ds_deadlist_obj));
+}
+
+/*
+ * Swap 'clone' with its origin head file system. Used at the end
+ * of "online recv" to swizzle the file system to the new version.
+ */
+int
+dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
+ boolean_t force)
+{
+ struct cloneswaparg csa;
+ int error;
+
+ ASSERT(clone->ds_owner);
+ ASSERT(origin_head->ds_owner);
+retry:
+ /* Need exclusive access for the swap */
+ rw_enter(&clone->ds_rwlock, RW_WRITER);
+ if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
+ rw_exit(&clone->ds_rwlock);
+ rw_enter(&origin_head->ds_rwlock, RW_WRITER);
+ if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
+ rw_exit(&origin_head->ds_rwlock);
+ goto retry;
+ }
+ }
+ csa.cds = clone;
+ csa.ohds = origin_head;
+ csa.force = force;
+ error = dsl_sync_task_do(clone->ds_dir->dd_pool,
+ dsl_dataset_clone_swap_check,
+ dsl_dataset_clone_swap_sync, &csa, NULL, 9);
+ return (error);
+}
+
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
@@ -2013,23 +2887,220 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
{
spa_t *spa;
dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
+ dsl_dataset_t *ds;
int error;
if ((error = spa_open(pname, &spa, FTAG)) != 0)
return (error);
dp = spa_get_dsl(spa);
rw_enter(&dp->dp_config_rwlock, RW_READER);
- if ((error = dsl_dataset_open_obj(dp, obj,
- NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
- rw_exit(&dp->dp_config_rwlock);
- spa_close(spa, FTAG);
- return (error);
+ if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_rele(ds, FTAG);
}
- dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
rw_exit(&dp->dp_config_rwlock);
spa_close(spa, FTAG);
+ return (error);
+}
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+{
+ int error = 0;
+
+ ASSERT3S(asize, >, 0);
+
+ /*
+ * *ref_rsrv is the portion of asize that will come from any
+ * unconsumed refreservation space.
+ */
+ *ref_rsrv = 0;
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Make a space adjustment for reserved bytes.
+ */
+ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+ ASSERT3U(*used, >=,
+ ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+ *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+ *ref_rsrv =
+ asize - MIN(asize, parent_delta(ds, asize + inflight));
+ }
+
+ if (!check_quota || ds->ds_quota == 0) {
+ mutex_exit(&ds->ds_lock);
+ return (0);
+ }
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
+ if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+ error = ERESTART;
+ else
+ error = EDQUOT;
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
+ return (ENOTSUP);
+
+ if (new_quota == 0)
+ return (0);
+
+ if (new_quota < ds->ds_phys->ds_used_bytes ||
+ new_quota < ds->ds_reserved)
+ return (ENOSPC);
+
return (0);
}
+
+/* ARGSUSED */
+void
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ ds->ds_quota = new_quota;
+
+ dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+
+ spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
+ tx, cr, "%lld dataset = %llu ",
+ (longlong_t)new_quota, ds->ds_object);
+}
+
+int
+dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (err)
+ return (err);
+
+ if (quota != ds->ds_quota) {
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(ds->ds_dir->dd_pool, 0);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
+ ds, &quota, 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
+static int
+dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ int64_t delta;
+ uint64_t unique;
+
+ if (new_reservation > INT64_MAX)
+ return (EOVERFLOW);
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_REFRESERVATION)
+ return (ENOTSUP);
+
+ if (dsl_dataset_is_snapshot(ds))
+ return (EINVAL);
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ mutex_enter(&ds->ds_lock);
+ unique = dsl_dataset_unique(ds);
+ delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
+ mutex_exit(&ds->ds_lock);
+
+ if (delta > 0 &&
+ delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+ return (ENOSPC);
+ if (delta > 0 && ds->ds_quota > 0 &&
+ new_reservation > ds->ds_quota)
+ return (ENOSPC);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
+ dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t unique;
+ int64_t delta;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ mutex_enter(&ds->ds_dir->dd_lock);
+ mutex_enter(&ds->ds_lock);
+ unique = dsl_dataset_unique(ds);
+ delta = MAX(0, (int64_t)(new_reservation - unique)) -
+ MAX(0, (int64_t)(ds->ds_reserved - unique));
+ ds->ds_reserved = new_reservation;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
+ new_reservation, cr, tx);
+
+ spa_history_internal_log(LOG_DS_REFRESERV,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
+ (longlong_t)new_reservation, ds->ds_object);
+}
+
+int
+dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_set_reservation_check,
+ dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
new file mode 100644
index 000000000000..2ce16fe20e12
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -0,0 +1,735 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * DSL permissions are stored in a two level zap attribute
+ * mechanism. The first level identifies the "class" of
+ * entry. The class is identified by the first 2 letters of
+ * the attribute. The second letter "l" or "d" identifies whether
+ * it is a local or descendent permission. The first letter
+ * identifies the type of entry.
+ *
+ * ul$<id> identifies permissions granted locally for this userid.
+ * ud$<id> identifies permissions granted on descendent datasets for
+ * this userid.
+ * Ul$<id> identifies permission sets granted locally for this userid.
+ * Ud$<id> identifies permission sets granted on descendent datasets for
+ * this userid.
+ * gl$<id> identifies permissions granted locally for this groupid.
+ * gd$<id> identifies permissions granted on descendent datasets for
+ * this groupid.
+ * Gl$<id> identifies permission sets granted locally for this groupid.
+ * Gd$<id> identifies permission sets granted on descendent datasets for
+ * this groupid.
+ * el$ identifies permissions granted locally for everyone.
+ * ed$ identifies permissions granted on descendent datasets
+ * for everyone.
+ * El$ identifies permission sets granted locally for everyone.
+ * Ed$ identifies permission sets granted to descendent datasets for
+ * everyone.
+ * c-$ identifies permission to create at dataset creation time.
+ * C-$ identifies permission sets to grant locally at dataset creation
+ * time.
+ * s-$@<name> permissions defined in specified set @<name>
+ * S-$@<name> Sets defined in named set @<name>
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Validate that user is allowed to delegate specified permissions.
+ *
+ * In order to delegate "create" you must have "create"
+ * and "allow".
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+
+ if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
+ return (EPERM);
+
+ if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Validate that user is allowed to unallow specified permissions. They
+ * must have the 'allow' permission, and even then can only unallow
+ * perms for their uid.
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+ char idstr[32];
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ (void) snprintf(idstr, sizeof (idstr), "%lld",
+ (longlong_t)crgetuid(cr));
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
+
+ if (type != ZFS_DELEG_USER &&
+ type != ZFS_DELEG_USER_SETS)
+ return (EPERM);
+
+ if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
+ return (EPERM);
+ }
+ return (0);
+}
+
+static void
+dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ nvlist_t *nvp = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(zap_update(mos, zapobj,
+ whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ VERIFY(zap_update(mos, jumpobj,
+ perm, 8, 1, &n, tx) == 0);
+ spa_history_internal_log(LOG_DS_PERM_UPDATE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s %s dataset = %llu", whokey, perm,
+ dd->dd_phys->dd_head_dataset_obj);
+ }
+ }
+}
+
+static void
+dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ nvlist_t *nvp = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ return;
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ if (nvpair_value_nvlist(whopair, &perms) != 0) {
+ if (zap_lookup(mos, zapobj, whokey, 8,
+ 1, &jumpobj) == 0) {
+ (void) zap_remove(mos, zapobj, whokey, tx);
+ VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+ }
+ spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s dataset = %llu", whokey,
+ dd->dd_phys->dd_head_dataset_obj);
+ continue;
+ }
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+ continue;
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ (void) zap_remove(mos, jumpobj, perm, tx);
+ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+ (void) zap_remove(mos, zapobj,
+ whokey, tx);
+ VERIFY(0 == zap_destroy(mos,
+ jumpobj, tx));
+ }
+ spa_history_internal_log(LOG_DS_PERM_REMOVE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s %s dataset = %llu", whokey, perm,
+ dd->dd_phys->dd_head_dataset_obj);
+ }
+ }
+}
+
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+ dsl_dir_t *dd;
+ int error;
+ nvpair_t *whopair = NULL;
+ int blocks_modified = 0;
+
+ error = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (error)
+ return (error);
+
+ if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ dsl_dir_close(dd, FTAG);
+ return (ENOTSUP);
+ }
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair))
+ blocks_modified++;
+
+ error = dsl_sync_task_do(dd->dd_pool, NULL,
+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+ dd, nvp, blocks_modified);
+ dsl_dir_close(dd, FTAG);
+
+ return (error);
+}
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists.
+ * each setpoint is an nvlist composed of an nvlist of an nvlist
+ * of the individual * users/groups/everyone/create
+ * permissions.
+ *
+ * The nvlist will look like this.
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order. For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ int error;
+ objset_t *mos;
+
+ error = dsl_dir_open(ddname, FTAG, &startdd, NULL);
+ if (error)
+ return (error);
+
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+ zap_cursor_t basezc;
+ zap_attribute_t baseza;
+ nvlist_t *sp_nvp;
+ uint64_t n;
+ char source[MAXNAMELEN];
+
+ if (dd->dd_phys->dd_deleg_zapobj &&
+ (zap_count(mos, dd->dd_phys->dd_deleg_zapobj,
+ &n) == 0) && n) {
+ VERIFY(nvlist_alloc(&sp_nvp,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ } else {
+ continue;
+ }
+
+ for (zap_cursor_init(&basezc, mos,
+ dd->dd_phys->dd_deleg_zapobj);
+ zap_cursor_retrieve(&basezc, &baseza) == 0;
+ zap_cursor_advance(&basezc)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ nvlist_t *perms_nvp;
+
+ ASSERT(baseza.za_integer_length == 8);
+ ASSERT(baseza.za_num_integers == 1);
+
+ VERIFY(nvlist_alloc(&perms_nvp,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY(nvlist_add_boolean(perms_nvp,
+ za.za_name) == 0);
+ }
+ zap_cursor_fini(&zc);
+ VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name,
+ perms_nvp) == 0);
+ nvlist_free(perms_nvp);
+ }
+
+ zap_cursor_fini(&basezc);
+
+ dsl_dir_name(dd, source);
+ VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0);
+ nvlist_free(sp_nvp);
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ dsl_dir_close(startdd, FTAG);
+ return (0);
+}
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+typedef struct perm_set {
+ avl_node_t p_node;
+ boolean_t p_matched;
+ char p_setname[ZFS_MAX_DELEG_NAME];
+} perm_set_t;
+
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+ const perm_set_t *node1 = arg1;
+ const perm_set_t *node2 = arg2;
+ int val;
+
+ val = strcmp(node1->p_setname, node2->p_setname);
+ if (val == 0)
+ return (0);
+ return (val > 0 ? 1 : -1);
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute has to be retrieved. i.e. ul$12
+ * Once the base object has been retrieved the actual permission
+ * is lookup up in the zap object the base object points to.
+ *
+ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
+ * there is no perm in that jumpobj.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, const char *perm)
+{
+ int error;
+ uint64_t jumpobj, zero;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error == 0) {
+ error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
+ if (error == ENOENT)
+ error = EPERM;
+ }
+ return (error);
+}
+
+/*
+ * check a specified user/group for a requested permission
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+ int checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids;
+ int i;
+ uint64_t id;
+
+ /* check for user */
+ id = crgetuid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_USER, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for users primary group */
+ id = crgetgid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for everyone entry */
+ id = -1;
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check each supplemental group user is a member of */
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+ }
+
+ return (EPERM);
+}
+
+/*
+ * Iterate over the sets specified in the specified zapobj
+ * and load them into the permsets avl tree.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ perm_set_t *permnode;
+ avl_index_t idx;
+ uint64_t jumpobj;
+ int error;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error != 0)
+ return (error);
+
+ for (zap_cursor_init(&zc, mos, jumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+ (void) strlcpy(permnode->p_setname, za.za_name,
+ sizeof (permnode->p_setname));
+ permnode->p_matched = B_FALSE;
+
+ if (avl_find(avl, permnode, &idx) == NULL) {
+ avl_insert(avl, permnode, idx);
+ } else {
+ kmem_free(permnode, sizeof (perm_set_t));
+ }
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+/*
+ * Load all permissions user based on cred belongs to.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+ char checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids, i;
+ uint64_t id;
+
+ id = crgetuid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+ id = crgetgid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+ }
+}
+
+/*
+ * Check if user has requested permission.
+ */
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ void *cookie;
+ int error;
+ char checkflag = ZFS_DELEG_LOCAL;
+ objset_t *mos;
+ avl_tree_t permsets;
+ perm_set_t *setnode;
+
+ error = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (error)
+ return (error);
+
+ dp = ds->ds_dir->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ if (dsl_delegation_on(mos) == B_FALSE) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ECANCELED);
+ }
+
+ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EPERM);
+ }
+
+ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
+ offsetof(perm_set_t, p_node));
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
+ checkflag = ZFS_DELEG_DESCENDENT) {
+ uint64_t zapobj;
+ boolean_t expanded;
+
+ /*
+ * If not in global zone then make sure
+ * the zoned property is set
+ */
+ if (!INGLOBALZONE(curthread)) {
+ uint64_t zoned;
+
+ if (dsl_prop_get_dd(dd,
+ zfs_prop_to_name(ZFS_PROP_ZONED),
+ 8, 1, &zoned, NULL) != 0)
+ break;
+ if (!zoned)
+ break;
+ }
+ zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ continue;
+
+ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
+again:
+ expanded = B_FALSE;
+ for (setnode = avl_first(&permsets); setnode;
+ setnode = AVL_NEXT(&permsets, setnode)) {
+ if (setnode->p_matched == B_TRUE)
+ continue;
+
+ /* See if this set directly grants this permission */
+ error = dsl_check_access(mos, zapobj,
+ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
+ if (error == 0)
+ goto success;
+ if (error == EPERM)
+ setnode->p_matched = B_TRUE;
+
+ /* See if this set includes other sets */
+ error = dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_NAMED_SET_SETS, 0,
+ setnode->p_setname, &permsets);
+ if (error == 0)
+ setnode->p_matched = expanded = B_TRUE;
+ }
+ /*
+ * If we expanded any sets, that will define more sets,
+ * which we need to check.
+ */
+ if (expanded)
+ goto again;
+
+ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
+ if (error == 0)
+ goto success;
+ }
+ error = EPERM;
+success:
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dataset_rele(ds, FTAG);
+
+ cookie = NULL;
+ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
+ kmem_free(setnode, sizeof (perm_set_t));
+
+ return (error);
+}
+
+/*
+ * Other routines.
+ */
+
+static void
+copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
+ boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t jumpobj, pjumpobj;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
+ ZFS_DELEG_LOCAL, NULL);
+ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
+ return;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
+ ZFS_DELEG_LOCAL, &uid);
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ for (zap_cursor_init(&zc, mos, pjumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zero = 0;
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+
+ VERIFY(zap_update(mos, jumpobj, za.za_name,
+ 8, 1, &zero, tx) == 0);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * set all create time permission on new dataset.
+ */
+void
+dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ uint64_t uid = crgetuid(cr);
+
+ if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return;
+
+ for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
+ uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ if (pzapobj == 0)
+ continue;
+
+ copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
+ copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
+ }
+}
+
+int
+dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (zapobj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+ VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
+ VERIFY(0 == zap_destroy(mos, zapobj, tx));
+ return (0);
+}
+
+boolean_t
+dsl_delegation_on(objset_t *os)
+{
+ return (os->os->os_spa->spa_delegation);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
index 5e563b632909..48d87f97f669 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -19,26 +19,28 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
+#include <sys/sunddi.h>
#include "zfs_namecheck.h"
-static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
+ cred_t *cr, dmu_tx_t *tx);
/* ARGSUSED */
@@ -55,8 +57,6 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
ASSERT(dd->dd_space_towrite[t] == 0);
}
- ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
-
if (dd->dd_parent)
dsl_dir_close(dd->dd_parent, dd);
@@ -91,9 +91,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dmu_object_info_t doi;
dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
}
#endif
- /* XXX assert bonus buffer size is correct */
if (dd == NULL) {
dsl_dir_t *winner;
int err;
@@ -103,7 +103,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
dd->dd_phys = dbuf->db_data;
- dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
@@ -112,36 +111,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
if (dd->dd_phys->dd_parent_obj) {
err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
NULL, dd, &dd->dd_parent);
- if (err) {
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
+ if (err)
+ goto errout;
if (tail) {
#ifdef ZFS_DEBUG
uint64_t foundobj;
err = zap_lookup(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj,
tail, sizeof (foundobj), 1, &foundobj);
ASSERT(err || foundobj == ddobj);
#endif
(void) strcpy(dd->dd_myname, tail);
} else {
err = zap_value_search(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
- ddobj, dd->dd_myname);
- }
- if (err) {
- dsl_dir_close(dd->dd_parent, dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ ddobj, 0, dd->dd_myname);
}
+ if (err)
+ goto errout;
} else {
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
}
@@ -174,6 +162,15 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
ASSERT3P(dd->dd_dbuf, ==, dbuf);
*ddp = dd;
return (0);
+
+errout:
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+
}
void
@@ -404,27 +401,37 @@ dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
}
uint64_t
-dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
+ dmu_tx_t *tx)
{
- objset_t *mos = pds->dd_pool->dp_meta_objset;
+ objset_t *mos = dp->dp_meta_objset;
uint64_t ddobj;
dsl_dir_phys_t *dsphys;
dmu_buf_t *dbuf;
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
- VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
- name, sizeof (uint64_t), 1, &ddobj, tx));
+ if (pds) {
+ VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ } else {
+ /* it's the root dir */
+ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
+ }
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->dd_creation_time = gethrestime_sec();
- dsphys->dd_parent_obj = pds->dd_object;
+ if (pds)
+ dsphys->dd_parent_obj = pds->dd_object;
dsphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
dsphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
+ dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
@@ -461,23 +468,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t val, obj;
+ dd_used_t t;
ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
/* Remove our reservation. */
val = 0;
- dsl_dir_set_reservation_sync(dd, &val, tx);
- ASSERT3U(dd->dd_used_bytes, ==, 0);
+ dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+ ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+ for (t = 0; t < DD_USED_NUM; t++)
+ ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0);
VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+ VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
VERIFY(0 == zap_remove(mos,
dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
@@ -486,65 +497,53 @@ dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
VERIFY(0 == dmu_object_free(mos, obj, tx));
}
-void
-dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+boolean_t
+dsl_dir_is_clone(dsl_dir_t *dd)
{
- dsl_dir_phys_t *dsp;
- dmu_buf_t *dbuf;
- int error;
-
- *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
-
- error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
- sizeof (uint64_t), 1, ddobjp, tx);
- ASSERT3U(error, ==, 0);
-
- VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsp = dbuf->db_data;
-
- dsp->dd_creation_time = gethrestime_sec();
- dsp->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsp->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
-
- dmu_buf_rele(dbuf, FTAG);
+ return (dd->dd_phys->dd_origin_obj &&
+ (dd->dd_pool->dp_origin_snap == NULL ||
+ dd->dd_phys->dd_origin_obj !=
+ dd->dd_pool->dp_origin_snap->ds_object));
}
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
- dsl_dir_space_available(dd, NULL, 0, TRUE));
-
mutex_enter(&dd->dd_lock);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
- dd->dd_phys->dd_quota);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ dd->dd_phys->dd_used_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
dd->dd_phys->dd_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
(dd->dd_phys->dd_uncompressed_bytes * 100 /
dd->dd_phys->dd_compressed_bytes));
+ if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
+ dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
+ dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
+ dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
+ dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
+ dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+ }
mutex_exit(&dd->dd_lock);
- if (dd->dd_phys->dd_clone_parent_obj) {
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_dir_is_clone(dd)) {
dsl_dataset_t *ds;
char buf[MAXNAMELEN];
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ds));
+ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
+ dd->dd_phys->dd_origin_obj, FTAG, &ds));
dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
-
+ dsl_dataset_rele(ds, FTAG);
dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
}
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
}
void
@@ -580,7 +579,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
- dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
mutex_exit(&dd->dd_lock);
/* release the hold from dsl_dir_dirty */
@@ -588,15 +586,13 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
}
static uint64_t
-dsl_dir_estimated_space(dsl_dir_t *dd)
+dsl_dir_space_towrite(dsl_dir_t *dd)
{
- int64_t space;
+ uint64_t space = 0;
int i;
ASSERT(MUTEX_HELD(&dd->dd_lock));
- space = dd->dd_phys->dd_used_bytes;
- ASSERT(space >= 0);
for (i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i&TXG_MASK];
ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
@@ -630,13 +626,9 @@ dsl_dir_space_available(dsl_dir_t *dd,
mutex_enter(&dd->dd_lock);
if (dd->dd_phys->dd_quota != 0)
quota = dd->dd_phys->dd_quota;
- if (ondiskonly) {
- used = dd->dd_used_bytes;
- } else {
- used = dsl_dir_estimated_space(dd);
- }
- if (dd == ancestor)
- used += delta;
+ used = dd->dd_phys->dd_used_bytes;
+ if (!ondiskonly)
+ used += dsl_dir_space_towrite(dd);
if (dd->dd_parent == NULL) {
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
@@ -651,6 +643,14 @@ dsl_dir_space_available(dsl_dir_t *dd,
parentspace += dd->dd_phys->dd_reserved - used;
}
+ if (dd == ancestor) {
+ ASSERT(delta <= 0);
+ ASSERT(used >= -delta);
+ used += delta;
+ if (parentspace != UINT64_MAX)
+ parentspace -= delta;
+ }
+
if (used > quota) {
/* over quota */
myspace = 0;
@@ -678,50 +678,68 @@ dsl_dir_space_available(dsl_dir_t *dd,
struct tempreserve {
list_node_t tr_node;
+ dsl_pool_t *tr_dp;
dsl_dir_t *tr_ds;
uint64_t tr_size;
};
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
- */
static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
- uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+ boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
+ dmu_tx_t *tx, boolean_t first)
{
uint64_t txg = tx->tx_txg;
- uint64_t est_used, quota, parent_rsrv;
- int edquot = EDQUOT;
+ uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+ struct tempreserve *tr;
+ int enospc = EDQUOT;
int txgidx = txg & TXG_MASK;
int i;
- struct tempreserve *tr;
+ uint64_t ref_rsrv = 0;
ASSERT3U(txg, !=, 0);
- ASSERT3S(asize, >=, 0);
+ ASSERT3S(asize, >, 0);
mutex_enter(&dd->dd_lock);
+
/*
* Check against the dsl_dir's quota. We don't add in the delta
* when checking for over-quota because they get one free hit.
*/
- est_used = dsl_dir_estimated_space(dd);
+ est_inflight = dsl_dir_space_towrite(dd);
for (i = 0; i < TXG_SIZE; i++)
- est_used += dd->dd_tempreserved[i];
+ est_inflight += dd->dd_tempreserved[i];
+ used_on_disk = dd->dd_phys->dd_used_bytes;
- quota = UINT64_MAX;
+ /*
+ * On the first iteration, fetch the dataset's used-on-disk and
+ * refreservation values. Also, if checkrefquota is set, test if
+ * allocating this space would exceed the dataset's refquota.
+ */
+ if (first && tx->tx_objset) {
+ int error;
+ dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+
+ error = dsl_dataset_check_quota(ds, checkrefquota,
+ asize, est_inflight, &used_on_disk, &ref_rsrv);
+ if (error) {
+ mutex_exit(&dd->dd_lock);
+ return (error);
+ }
+ }
- if (dd->dd_phys->dd_quota)
+ /*
+ * If this transaction will result in a net free of space,
+ * we want to let it through.
+ */
+ if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+ quota = UINT64_MAX;
+ else
quota = dd->dd_phys->dd_quota;
/*
- * If this transaction will result in a net free of space, we want
- * to let it through, but we have to be careful: the space that it
- * frees won't become available until *after* this txg syncs.
- * Therefore, to ensure that it's possible to remove files from
- * a full pool without inducing transient overcommits, we throttle
+ * Adjust the quota against the actual pool size at the root.
+ * To ensure that it's possible to remove files from a full
+ * pool without inducing transient overcommits, we throttle
* netfree transactions against a quota that is slightly larger,
* but still within the pool's allocation slop. In cases where
* we're very close to full, this will allow a steady trickle of
@@ -731,47 +749,45 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
if (poolsize < quota) {
quota = poolsize;
- edquot = ENOSPC;
+ enospc = ENOSPC;
}
- } else if (netfree) {
- quota = UINT64_MAX;
}
/*
* If they are requesting more space, and our current estimate
- * is over quota. They get to try again unless the actual
+ * is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
- if (asize > 0 && est_used > quota) {
- if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
- dd->dd_used_bytes < quota)
- edquot = ERESTART;
- dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+ if (used_on_disk + est_inflight > quota) {
+ if (est_inflight > 0 || used_on_disk < quota)
+ enospc = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
- dd->dd_used_bytes>>10, est_used>>10,
- quota>>10, asize>>10, edquot);
+ used_on_disk>>10, est_inflight>>10,
+ quota>>10, asize>>10, enospc);
mutex_exit(&dd->dd_lock);
- return (edquot);
+ return (enospc);
}
/* We need to up our estimated delta before dropping dd_lock */
dd->dd_tempreserved[txgidx] += asize;
- parent_rsrv = parent_delta(dd, est_used, asize);
+ parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+ asize - ref_rsrv);
mutex_exit(&dd->dd_lock);
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
tr->tr_ds = dd;
tr->tr_size = asize;
list_insert_tail(tr_list, tr);
/* see if it's OK with our parent */
if (dd->dd_parent && parent_rsrv) {
+ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, tr_list, tx));
+ parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
} else {
return (0);
}
@@ -779,42 +795,62 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
/*
* Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
*/
int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
- uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+ uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
{
- int err = 0;
+ int err;
list_t *tr_list;
+ if (asize == 0) {
+ *tr_cookiep = NULL;
+ return (0);
+ }
+
tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node));
- ASSERT3S(asize, >=, 0);
+ ASSERT3S(asize, >, 0);
ASSERT3S(fsize, >=, 0);
- err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
- tr_list, tx);
-
+ err = arc_tempreserve_space(lsize, tx->tx_txg);
if (err == 0) {
struct tempreserve *tr;
- err = arc_tempreserve_space(lsize);
- if (err == 0) {
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = NULL;
- tr->tr_size = lsize;
- list_insert_tail(tr_list, tr);
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+
+ err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
+ } else {
+ if (err == EAGAIN) {
+ txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ err = ERESTART;
}
+ dsl_pool_memory_pressure(dd->dd_pool);
+ }
+
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_dp = dd->dd_pool;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+ FALSE, asize > usize, tr_list, tx, TRUE);
}
if (err)
dsl_dir_tempreserve_clear(tr_list, tx);
else
*tr_cookiep = tr_list;
+
return (err);
}
@@ -831,15 +867,20 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
ASSERT3U(tx->tx_txg, !=, 0);
+ if (tr_cookie == NULL)
+ return;
+
while (tr = list_head(tr_list)) {
- if (tr->tr_ds == NULL) {
- arc_tempreserve_clear(tr->tr_size);
- } else {
+ if (tr->tr_dp) {
+ dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
+ } else if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
tr->tr_size);
tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
mutex_exit(&tr->tr_ds->dd_lock);
+ } else {
+ arc_tempreserve_clear(tr->tr_size);
}
list_remove(tr_list, tr);
kmem_free(tr, sizeof (struct tempreserve));
@@ -848,13 +889,8 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
kmem_free(tr_list, sizeof (list_t));
}
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data. Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+static void
+dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
int64_t parent_space;
uint64_t est_used;
@@ -863,7 +899,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
if (space > 0)
dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
- est_used = dsl_dir_estimated_space(dd);
+ est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
parent_space = parent_delta(dd, est_used, space);
mutex_exit(&dd->dd_lock);
@@ -872,39 +908,96 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
/* XXX this is potentially expensive and unnecessary... */
if (parent_space && dd->dd_parent)
- dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+ dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
+}
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * eg. when dirtying data. Be conservative (ie. OK to write less than
+ * this or free more than this, but don't write more or free less).
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ dsl_pool_willuse_space(dd->dd_pool, space, tx);
+ dsl_dir_willuse_space_impl(dd, space, tx);
}
/* call from syncing context when we actually write/free space for this dd */
void
-dsl_dir_diduse_space(dsl_dir_t *dd,
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
int64_t accounted_delta;
+ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(type < DD_USED_NUM);
dsl_dir_dirty(dd, tx);
- mutex_enter(&dd->dd_lock);
- accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
- ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+ if (needlock)
+ mutex_enter(&dd->dd_lock);
+ accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
+ ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
ASSERT(compressed >= 0 ||
dd->dd_phys->dd_compressed_bytes >= -compressed);
ASSERT(uncompressed >= 0 ||
dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
- dd->dd_used_bytes += used;
+ dd->dd_phys->dd_used_bytes += used;
dd->dd_phys->dd_uncompressed_bytes += uncompressed;
dd->dd_phys->dd_compressed_bytes += compressed;
- mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ ASSERT(used > 0 ||
+ dd->dd_phys->dd_used_breakdown[type] >= -used);
+ dd->dd_phys->dd_used_breakdown[type] += used;
+#ifdef DEBUG
+ dd_used_t t;
+ uint64_t u = 0;
+ for (t = 0; t < DD_USED_NUM; t++)
+ u += dd->dd_phys->dd_used_breakdown[t];
+ ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+#endif
+ }
+ if (needlock)
+ mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
- dsl_dir_diduse_space(dd->dd_parent,
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
accounted_delta, compressed, uncompressed, tx);
+ dsl_dir_transfer_space(dd->dd_parent,
+ used - accounted_delta,
+ DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
}
}
-/* ARGSUSED */
+void
+dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(oldtype < DD_USED_NUM);
+ ASSERT(newtype < DD_USED_NUM);
+
+ if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+ return;
+
+ dsl_dir_dirty(dd, tx);
+ if (needlock)
+ mutex_enter(&dd->dd_lock);
+ ASSERT(delta > 0 ?
+ dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
+ dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
+ ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
+ dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
+ dd->dd_phys->dd_used_breakdown[newtype] += delta;
+ if (needlock)
+ mutex_exit(&dd->dd_lock);
+}
+
static int
dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -921,22 +1014,22 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* If we are doing the preliminary check in open context, and
* there are pending changes, then don't fail it, since the
- * pending changes could under-estimat the amount of space to be
+ * pending changes could under-estimate the amount of space to be
* freed up.
*/
- towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
- dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+ towrite = dsl_dir_space_towrite(dd);
if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
(new_quota < dd->dd_phys->dd_reserved ||
- new_quota < dsl_dir_estimated_space(dd))) {
+ new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
err = ENOSPC;
}
mutex_exit(&dd->dd_lock);
return (err);
}
+/* ARGSUSED */
static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
uint64_t *quotap = arg2;
@@ -947,6 +1040,10 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
mutex_enter(&dd->dd_lock);
dd->dd_phys->dd_quota = new_quota;
mutex_exit(&dd->dd_lock);
+
+ spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+ tx, cr, "%lld dataset = %llu ",
+ (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
}
int
@@ -958,20 +1055,22 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
err = dsl_dir_open(ddname, FTAG, &dd, NULL);
if (err)
return (err);
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(dd->dd_pool, 0);
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, dd, &quota, 0);
+ if (quota != dd->dd_phys->dd_quota) {
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(dd->dd_pool, 0);
+
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, dd, &quota, 0);
+ }
dsl_dir_close(dd, FTAG);
return (err);
}
-/* ARGSUSED */
-static int
+int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
@@ -991,7 +1090,7 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
+ used = dd->dd_phys->dd_used_bytes;
delta = MAX(used, new_reservation) -
MAX(used, dd->dd_phys->dd_reserved);
mutex_exit(&dd->dd_lock);
@@ -1011,8 +1110,9 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
+/* ARGSUSED */
static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
uint64_t *reservationp = arg2;
@@ -1020,19 +1120,24 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
uint64_t used;
int64_t delta;
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
+ used = dd->dd_phys->dd_used_bytes;
delta = MAX(used, new_reservation) -
MAX(used, dd->dd_phys->dd_reserved);
- mutex_exit(&dd->dd_lock);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_reserved = new_reservation;
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
- dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ delta, 0, 0, tx);
}
+ mutex_exit(&dd->dd_lock);
+
+ spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+ tx, cr, "%lld dataset = %llu",
+ (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
}
int
@@ -1074,7 +1179,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
return (delta);
mutex_enter(&dd->dd_lock);
- delta = parent_delta(dd, dd->dd_used_bytes, delta);
+ delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
mutex_exit(&dd->dd_lock);
return (would_change(dd->dd_parent, delta, ancestor));
}
@@ -1084,7 +1189,7 @@ struct renamearg {
const char *mynewname;
};
-/* ARGSUSED */
+/*ARGSUSED*/
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1110,7 +1215,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (ra->newparent != dd->dd_parent) {
/* is there enough space? */
uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+ MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
/* no rename into our descendant */
if (closest_common_ancestor(dd, ra->newparent) == dd)
@@ -1125,7 +1230,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct renamearg *ra = arg2;
@@ -1136,15 +1241,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
if (ra->newparent != dd->dd_parent) {
- uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
- dsl_dir_diduse_space(dd->dd_parent, -myspace,
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ -dd->dd_phys->dd_used_bytes,
-dd->dd_phys->dd_compressed_bytes,
-dd->dd_phys->dd_uncompressed_bytes, tx);
- dsl_dir_diduse_space(ra->newparent, myspace,
+ dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
+ dd->dd_phys->dd_used_bytes,
dd->dd_phys->dd_compressed_bytes,
dd->dd_phys->dd_uncompressed_bytes, tx);
+
+ if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
+ uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
+ dd->dd_phys->dd_used_bytes;
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ -unused_rsrv, 0, 0, tx);
+ dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
+ unused_rsrv, 0, 0, tx);
+ }
}
dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1164,6 +1278,9 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
dd->dd_myname, 8, 1, &dd->dd_object, tx);
ASSERT3U(err, ==, 0);
+
+ spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}
int
@@ -1189,7 +1306,6 @@ dsl_dir_rename(dsl_dir_t *dd, const char *newname)
goto out;
}
-
err = dsl_sync_task_do(dd->dd_pool,
dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 00abf7ec2c6b..4585dc805fe5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
@@ -36,20 +34,36 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+
+int zfs_no_write_throttle = 0;
+int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
+int zfs_txg_synctime = 5; /* target secs to sync a txg */
+
+uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
+uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
+uint64_t zfs_write_limit_inflated = 0;
+uint64_t zfs_write_limit_override = 0;
+extern uint64_t zfs_write_limit_min;
+
+kmutex_t zfs_write_limit_lock;
+
+static pgcnt_t old_physmem = 0;
static int
-dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
uint64_t obj;
int err;
err = zap_lookup(dp->dp_meta_objset,
dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
- MOS_DIR_NAME, sizeof (obj), 1, &obj);
+ name, sizeof (obj), 1, &obj);
if (err)
return (err);
- return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
+ return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}
static dsl_pool_t *
@@ -62,6 +76,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
dp->dp_spa = spa;
dp->dp_meta_rootbp = *bp;
rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+ dp->dp_write_limit = zfs_write_limit_min;
txg_init(dp, txg);
txg_list_create(&dp->dp_dirty_datasets,
@@ -70,9 +85,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dir_t, dd_dirty_link));
txg_list_create(&dp->dp_sync_tasks,
offsetof(dsl_sync_task_group_t, dstg_node));
- list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+ list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
offsetof(dsl_dataset_t, ds_synced_link));
+ mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
+
return (dp);
}
@@ -81,9 +99,11 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
objset_impl_t *osi;
- rw_enter(&dp->dp_config_rwlock, RW_READER);
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
if (err)
goto out;
@@ -100,10 +120,73 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
if (err)
goto out;
- err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+ err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
if (err)
goto out;
+ if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+ err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+ if (err)
+ goto out;
+ err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
+ FTAG, &ds);
+ if (err)
+ goto out;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ dp, &dp->dp_origin_snap);
+ if (err)
+ goto out;
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dir_close(dd, dp);
+ }
+
+ /* get scrub status */
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+ &dp->dp_scrub_func);
+ if (err == 0) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+ &dp->dp_scrub_queue_obj);
+ if (err)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+ &dp->dp_scrub_min_txg);
+ if (err)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+ &dp->dp_scrub_max_txg);
+ if (err)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ &dp->dp_scrub_bookmark);
+ if (err)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+ &spa->spa_scrub_errors);
+ if (err)
+ goto out;
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool. Restart from the beginning, since the
+ * old software may have changed the pool in the
+ * meantime.
+ */
+ dsl_pool_scrub_restart(dp);
+ }
+ } else {
+ /*
+ * It's OK if there is no scrub in progress (and if
+ * there was an I/O error, ignore it).
+ */
+ err = 0;
+ }
+
out:
rw_exit(&dp->dp_config_rwlock);
if (err)
@@ -117,7 +200,15 @@ out:
void
dsl_pool_close(dsl_pool_t *dp)
{
- /* drop our reference from dsl_pool_open() */
+ /* drop our references from dsl_pool_open() */
+
+ /*
+ * Since we held the origin_snap from "syncing" context (which
+ * includes pool-opening context), it actually only got a "ref"
+ * and not a hold, so just drop that here.
+ */
+ if (dp->dp_origin_snap)
+ dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
if (dp->dp_mos_dir)
dsl_dir_close(dp->dp_mos_dir, dp);
if (dp->dp_root_dir)
@@ -130,20 +221,27 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_dirs);
txg_list_destroy(&dp->dp_sync_tasks);
- list_destroy(&dp->dp_synced_objsets);
+ list_destroy(&dp->dp_synced_datasets);
- arc_flush();
+ arc_flush(dp->dp_spa);
txg_fini(dp);
rw_destroy(&dp->dp_config_rwlock);
+ mutex_destroy(&dp->dp_lock);
+ mutex_destroy(&dp->dp_scrub_cancel_lock);
kmem_free(dp, sizeof (dsl_pool_t));
}
dsl_pool_t *
-dsl_pool_create(spa_t *spa, uint64_t txg)
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+ objset_impl_t *osip;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ /* create and open the MOS (meta-objset) */
dp->dp_meta_objset = &dmu_objset_create_impl(spa,
NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
@@ -153,13 +251,29 @@ dsl_pool_create(spa_t *spa, uint64_t txg)
ASSERT3U(err, ==, 0);
/* create and open the root dir */
- dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
NULL, dp, &dp->dp_root_dir));
/* create and open the meta-objset dir */
- (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
- VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ MOS_DIR_NAME, &dp->dp_mos_dir));
+
+ if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+ dsl_pool_create_origin(dp, tx);
+
+ /* create the root dataset */
+ dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+
+ /* create the root objset */
+ VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ osip = dmu_objset_create_impl(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+#ifdef _KERNEL
+ zfs_create_fs(&osip->os, kcred, zplprops, tx);
+#endif
+ dsl_dataset_rele(ds, FTAG);
dmu_tx_commit(tx);
@@ -175,26 +289,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dataset_t *ds;
dsl_sync_task_group_t *dstg;
objset_impl_t *mosi = dp->dp_meta_objset->os;
+ hrtime_t start, write_time;
+ uint64_t data_written;
int err;
tx = dmu_tx_create_assigned(dp, txg);
+ dp->dp_read_overhead = 0;
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
if (!list_link_active(&ds->ds_synced_link))
- list_insert_tail(&dp->dp_synced_objsets, ds);
+ list_insert_tail(&dp->dp_synced_datasets, ds);
else
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, zio, tx);
}
+ DTRACE_PROBE(pool_sync__1setup);
+
+ start = gethrtime();
err = zio_wait(zio);
+ write_time = gethrtime() - start;
ASSERT(err == 0);
+ DTRACE_PROBE(pool_sync__2rootzio);
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
dsl_sync_task_group_sync(dstg, tx);
+ DTRACE_PROBE(pool_sync__3task);
+
+ start = gethrtime();
while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
dsl_dir_sync(dd, tx);
+ write_time += gethrtime() - start;
+
+ if (spa_sync_pass(dp->dp_spa) == 1)
+ dsl_pool_scrub_sync(dp, tx);
+ start = gethrtime();
if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -204,8 +334,51 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}
+ write_time += gethrtime() - start;
+ DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
+ hrtime_t, dp->dp_read_overhead);
+ write_time -= dp->dp_read_overhead;
dmu_tx_commit(tx);
+
+ data_written = dp->dp_space_towrite[txg & TXG_MASK];
+ dp->dp_space_towrite[txg & TXG_MASK] = 0;
+ ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
+
+ /*
+ * If the write limit max has not been explicitly set, set it
+ * to a fraction of available physical memory (default 1/8th).
+ * Note that we must inflate the limit because the spa
+ * inflates write sizes to account for data replication.
+ * Check this each sync phase to catch changing memory size.
+ */
+ if (physmem != old_physmem && zfs_write_limit_shift) {
+ mutex_enter(&zfs_write_limit_lock);
+ old_physmem = physmem;
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ zfs_write_limit_inflated = MAX(zfs_write_limit_min,
+ spa_get_asize(dp->dp_spa, zfs_write_limit_max));
+ mutex_exit(&zfs_write_limit_lock);
+ }
+
+ /*
+ * Attempt to keep the sync time consistent by adjusting the
+ * amount of write traffic allowed into each transaction group.
+ * Weight the throughput calculation towards the current value:
+ * thru = 3/4 old_thru + 1/4 new_thru
+ */
+ ASSERT(zfs_write_limit_min > 0);
+ if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
+ uint64_t throughput = (data_written * NANOSEC) / write_time;
+ if (dp->dp_throughput)
+ dp->dp_throughput = throughput / 4 +
+ 3 * dp->dp_throughput / 4;
+ else
+ dp->dp_throughput = throughput;
+ dp->dp_write_limit = MIN(zfs_write_limit_inflated,
+ MAX(zfs_write_limit_min,
+ dp->dp_throughput * zfs_txg_synctime));
+ }
}
void
@@ -213,8 +386,8 @@ dsl_pool_zil_clean(dsl_pool_t *dp)
{
dsl_dataset_t *ds;
- while (ds = list_head(&dp->dp_synced_objsets)) {
- list_remove(&dp->dp_synced_objsets, ds);
+ while (ds = list_head(&dp->dp_synced_datasets)) {
+ list_remove(&dp->dp_synced_datasets, ds);
ASSERT(ds->ds_user_ptr != NULL);
zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
dmu_buf_rele(ds->ds_dbuf, ds);
@@ -254,3 +427,187 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
return (space - resv);
}
+
+int
+dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+{
+ uint64_t reserved = 0;
+ uint64_t write_limit = (zfs_write_limit_override ?
+ zfs_write_limit_override : dp->dp_write_limit);
+
+ if (zfs_no_write_throttle) {
+ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
+ space);
+ return (0);
+ }
+
+ /*
+ * Check to see if we have exceeded the maximum allowed IO for
+ * this transaction group. We can do this without locks since
+ * a little slop here is ok. Note that we do the reserved check
+ * with only half the requested reserve: this is because the
+ * reserve requests are worst-case, and we really don't want to
+ * throttle based off of worst-case estimates.
+ */
+ if (write_limit > 0) {
+ reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
+ + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
+
+ if (reserved && reserved > write_limit)
+ return (ERESTART);
+ }
+
+ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
+
+ /*
+ * If this transaction group is over 7/8ths capacity, delay
+ * the caller 1 clock tick. This will slow down the "fill"
+ * rate until the sync process can catch up with us.
+ */
+ if (reserved && reserved > (write_limit - (write_limit >> 3)))
+ txg_delay(dp, tx->tx_txg, 1);
+
+ return (0);
+}
+
+void
+dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+ ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
+ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
+}
+
+void
+dsl_pool_memory_pressure(dsl_pool_t *dp)
+{
+ uint64_t space_inuse = 0;
+ int i;
+
+ if (dp->dp_write_limit == zfs_write_limit_min)
+ return;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ space_inuse += dp->dp_space_towrite[i];
+ space_inuse += dp->dp_tempreserved[i];
+ }
+ dp->dp_write_limit = MAX(zfs_write_limit_min,
+ MIN(dp->dp_write_limit, space_inuse / 4));
+}
+
+void
+dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+ if (space > 0) {
+ mutex_enter(&dp->dp_lock);
+ dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
+ mutex_exit(&dp->dp_lock);
+ }
+}
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds, *prev = NULL;
+ int err;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+ break;
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ prev = NULL;
+ }
+
+ if (prev == NULL) {
+ prev = dp->dp_origin_snap;
+
+ /*
+ * The $ORIGIN can't have any data, or the accounting
+ * will be wrong.
+ */
+ ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
+
+ /* The origin doesn't get attached to itself */
+ if (ds->ds_object == prev->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
+ ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ prev->ds_phys->ds_num_children++;
+
+ if (ds->ds_phys->ds_next_snap_obj == 0) {
+ ASSERT(ds->ds_prev == NULL);
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+ }
+ }
+
+ ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
+ ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
+
+ if (prev->ds_phys->ds_next_clones_obj == 0) {
+ prev->ds_phys->ds_next_clones_obj =
+ zap_create(dp->dp_meta_objset,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+ prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+
+ dsl_dataset_rele(ds, FTAG);
+ if (prev != dp->dp_origin_snap)
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap != NULL);
+
+ (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+ tx, DS_FIND_CHILDREN);
+}
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t dsobj;
+ dsl_dataset_t *ds;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap == NULL);
+
+ /* create the origin dir, ds, & snap-ds */
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+ NULL, 0, kcred, tx);
+ VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+ VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ dp, &dp->dp_origin_snap));
+ dsl_dataset_rele(ds, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
index 2fff66d06b1e..212acbbc5968 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,14 +44,20 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
{
zfs_prop_t prop;
- if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
- zfs_prop_readonly(prop))
+ /*
+ * The setonce properties are read-only, BUT they still
+ * have a default value that can be used as the initial
+ * value.
+ */
+ if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
+ (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
return (ENOENT);
- if (zfs_prop_get_type(prop) == prop_type_string) {
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
if (intsz != 1)
return (EOVERFLOW);
- (void) strncpy(buf, zfs_prop_default_string(prop), numint);
+ (void) strncpy(buf, zfs_prop_default_string(prop),
+ numint);
} else {
if (intsz != 8 || numint < 1)
return (EOVERFLOW);
@@ -62,13 +68,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
return (0);
}
-static int
-dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
+int
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
int intsz, int numint, void *buf, char *setpoint)
{
int err = ENOENT;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
zfs_prop_t prop;
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
if (setpoint)
setpoint[0] = '\0';
@@ -79,7 +88,6 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
* ouside this loop.
*/
for (; dd != NULL; dd = dd->dd_parent) {
- objset_t *mos = dd->dd_pool->dp_meta_objset;
ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
propname, intsz, numint, buf);
@@ -92,8 +100,7 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
/*
* Break out of this loop for non-inheritable properties.
*/
- if (prop != ZFS_PROP_INVAL &&
- !zfs_prop_inheritable(prop))
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
break;
}
if (err == ENOENT)
@@ -102,6 +109,26 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
return (err);
}
+int
+dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
+ int intsz, int numint, void *buf, char *setpoint)
+{
+ ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+
+ if (ds->ds_phys->ds_props_obj) {
+ int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_props_obj, propname, intsz, numint, buf);
+ if (err != ENOENT) {
+ if (setpoint)
+ dsl_dataset_name(ds, setpoint);
+ return (err);
+ }
+ }
+
+ return (dsl_prop_get_dd(ds->ds_dir, propname,
+ intsz, numint, buf, setpoint));
+}
+
/*
* Register interest in the named property. We'll call the callback
* once to notify it of the current property value, and again each time
@@ -114,18 +141,20 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg)
{
dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dd->dd_pool;
uint64_t value;
dsl_prop_cb_record_t *cbr;
int err;
int need_rwlock;
- need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
+ need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock);
if (need_rwlock)
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
+ err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL);
if (err != 0) {
- rw_exit(&dd->dd_pool->dp_config_rwlock);
+ if (need_rwlock)
+ rw_exit(&dp->dp_config_rwlock);
return (err);
}
@@ -141,46 +170,30 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
NULL, cbr, &dd));
if (need_rwlock)
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- /* Leave dataset open until this callback is unregistered */
+ rw_exit(&dp->dp_config_rwlock);
+ /* Leave dir open until this callback is unregistered */
return (0);
}
int
-dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint)
-{
- int err;
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
-
- return (err);
-}
-
-int
-dsl_prop_get(const char *ddname, const char *propname,
+dsl_prop_get(const char *dsname, const char *propname,
int intsz, int numints, void *buf, char *setpoint)
{
- dsl_dir_t *dd;
- const char *tail;
+ dsl_dataset_t *ds;
int err;
- err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
- if (tail && tail[0] != '@') {
- dsl_dir_close(dd, FTAG);
- return (ENOENT);
- }
- err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint);
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
- dsl_dir_close(dd, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -264,8 +277,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
dsl_prop_cb_record_t *cbr;
objset_t *mos = dp->dp_meta_objset;
zap_cursor_t zc;
- zap_attribute_t za;
+ zap_attribute_t *za;
int err;
+ uint64_t dummyval;
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
@@ -278,7 +292,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
* being inherited here or below; stop the recursion.
*/
err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- 8, 1, &value);
+ 8, 1, &dummyval);
if (err == 0) {
dsl_dir_close(dd, FTAG);
return;
@@ -287,22 +301,34 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
}
mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (strcmp(cbr->cbr_propname, propname) == 0) {
- cbr->cbr_func(cbr->cbr_arg, value);
- }
+ for (cbr = list_head(&dd->dd_prop_cbs); cbr;
+ cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+
+ if (strcmp(cbr->cbr_propname, propname) != 0)
+ continue;
+
+ /*
+ * If the property is set on this ds, then it is not
+ * inherited here; don't call the callback.
+ */
+ if (propobj && 0 == zap_lookup(mos, propobj, propname,
+ 8, 1, &dummyval))
+ continue;
+
+ cbr->cbr_func(cbr->cbr_arg, value);
}
mutex_exit(&dd->dd_lock);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
for (zap_cursor_init(&zc, mos,
dd->dd_phys->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_retrieve(&zc, za) == 0;
zap_cursor_advance(&zc)) {
- /* XXX recursion could blow stack; esp. za! */
- dsl_prop_changed_notify(dp, za.za_first_integer,
+ dsl_prop_changed_notify(dp, za->za_first_integer,
propname, value, FALSE);
}
+ kmem_free(za, sizeof (zap_attribute_t));
zap_cursor_fini(&zc);
dsl_dir_close(dd, FTAG);
}
@@ -316,22 +342,37 @@ struct prop_set_arg {
static void
-dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
+ dsl_dataset_t *ds = arg1;
struct prop_set_arg *psa = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
- uint64_t intval;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj, intval;
int isint;
+ char valbuf[32];
+ char *valstr;
isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+ if (dsl_dataset_is_snapshot(ds)) {
+ ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_SNAP_PROPS);
+ if (ds->ds_phys->ds_props_obj == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_props_obj =
+ zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ }
+ zapobj = ds->ds_phys->ds_props_obj;
+ } else {
+ zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+ }
+
if (psa->numints == 0) {
int err = zap_remove(mos, zapobj, psa->name, tx);
ASSERT(err == 0 || err == ENOENT);
if (isint) {
- VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
+ VERIFY(0 == dsl_prop_get_ds(ds,
psa->name, 8, 1, &intval, NULL));
}
} else {
@@ -342,32 +383,63 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
}
if (isint) {
- dsl_prop_changed_notify(dd->dd_pool,
- dd->dd_object, psa->name, intval, TRUE);
+ if (dsl_dataset_is_snapshot(ds)) {
+ dsl_prop_cb_record_t *cbr;
+ /*
+ * It's a snapshot; nothing can inherit this
+ * property, so just look for callbacks on this
+ * ds here.
+ */
+ mutex_enter(&ds->ds_dir->dd_lock);
+ for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
+ cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds &&
+ strcmp(cbr->cbr_propname, psa->name) == 0)
+ cbr->cbr_func(cbr->cbr_arg, intval);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ } else {
+ dsl_prop_changed_notify(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_object, psa->name, intval, TRUE);
+ }
+ }
+ if (isint) {
+ (void) snprintf(valbuf, sizeof (valbuf),
+ "%lld", (longlong_t)intval);
+ valstr = valbuf;
+ } else {
+ valstr = (char *)psa->buf;
}
+ spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
+ LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
+ "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
}
-int
-dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, const void *buf)
+void
+dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+ cred_t *cr, dmu_tx_t *tx)
{
- struct prop_set_arg psa;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
- psa.name = propname;
- psa.intsz = intsz;
- psa.numints = numints;
- psa.buf = buf;
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx));
+
+ dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
- return (dsl_sync_task_do(dd->dd_pool,
- NULL, dsl_prop_set_sync, dd, &psa, 2));
+ spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+ "%s=%llu dataset = %llu", name, (u_longlong_t)val,
+ dd->dd_phys->dd_head_dataset_obj);
}
int
-dsl_prop_set(const char *ddname, const char *propname,
+dsl_prop_set(const char *dsname, const char *propname,
int intsz, int numints, const void *buf)
{
- dsl_dir_t *dd;
+ dsl_dataset_t *ds;
int err;
+ struct prop_set_arg psa;
/*
* We must do these checks before we get to the syncfunc, since
@@ -378,11 +450,24 @@ dsl_prop_set(const char *ddname, const char *propname,
if (intsz * numints >= ZAP_MAXVALUELEN)
return (E2BIG);
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
- err = dsl_prop_set_dd(dd, propname, intsz, numints, buf);
- dsl_dir_close(dd, FTAG);
+
+ if (dsl_dataset_is_snapshot(ds) &&
+ spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOTSUP);
+ }
+
+ psa.name = propname;
+ psa.intsz = intsz;
+ psa.numints = numints;
+ psa.buf = buf;
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ NULL, dsl_prop_set_sync, ds, &psa, 2);
+
+ dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -390,45 +475,55 @@ dsl_prop_set(const char *ddname, const char *propname,
* Iterate over all properties for this dataset and return them in an nvlist.
*/
int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local)
{
dsl_dataset_t *ds = os->os->os_dsl_dataset;
dsl_dir_t *dd = ds->ds_dir;
+ boolean_t snapshot = dsl_dataset_is_snapshot(ds);
int err = 0;
- dsl_pool_t *dp;
- objset_t *mos;
-
- if (dsl_dataset_is_snapshot(ds)) {
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- return (0);
- }
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t propobj = ds->ds_phys->ds_props_obj;
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- dp = dd->dd_pool;
- mos = dp->dp_meta_objset;
+ if (local && snapshot && !propobj)
+ return (0);
rw_enter(&dp->dp_config_rwlock, RW_READER);
- for (; dd != NULL; dd = dd->dd_parent) {
+ while (dd != NULL) {
char setpoint[MAXNAMELEN];
zap_cursor_t zc;
zap_attribute_t za;
+ dsl_dir_t *dd_next;
+
+ if (propobj) {
+ dsl_dataset_name(ds, setpoint);
+ dd_next = dd;
+ } else {
+ dsl_dir_name(dd, setpoint);
+ propobj = dd->dd_phys->dd_props_zapobj;
+ dd_next = dd->dd_parent;
+ }
- dsl_dir_name(dd, setpoint);
-
- for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
+ for (zap_cursor_init(&zc, mos, propobj);
(err = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
nvlist_t *propval;
- zfs_prop_t prop;
- /*
- * Skip non-inheritable properties.
- */
- if ((prop = zfs_name_to_prop(za.za_name)) !=
- ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
- dd != ds->ds_dir)
+ zfs_prop_t prop = zfs_name_to_prop(za.za_name);
+
+ /* Skip non-inheritable properties. */
+ if (prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop) &&
+ (dd != ds->ds_dir || (snapshot && dd != dd_next)))
continue;
+ /* Skip properties not valid for this type. */
+ if (snapshot && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+ continue;
+
+ /* Skip properties already defined */
if (nvlist_lookup_nvlist(*nvp, za.za_name,
&propval) == 0)
continue;
@@ -441,28 +536,26 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
*/
char *tmp = kmem_alloc(za.za_num_integers,
KM_SLEEP);
- err = zap_lookup(mos,
- dd->dd_phys->dd_props_zapobj,
- za.za_name, 1, za.za_num_integers,
- tmp);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
if (err != 0) {
kmem_free(tmp, za.za_num_integers);
break;
}
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_VALUE, tmp) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
kmem_free(tmp, za.za_num_integers);
} else {
/*
* Integer property
*/
ASSERT(za.za_integer_length == 8);
- (void) nvlist_add_uint64(propval,
- ZFS_PROP_VALUE, za.za_first_integer);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
}
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_SOURCE, setpoint) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
+ setpoint) == 0);
VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
propval) == 0);
nvlist_free(propval);
@@ -472,6 +565,14 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
if (err != ENOENT)
break;
err = 0;
+ /*
+ * If we are just after the props that have been set
+ * locally, then we are done after the first iteration.
+ */
+ if (local)
+ break;
+ dd = dd_next;
+ propobj = 0;
}
rw_exit(&dp->dp_config_rwlock);
@@ -484,7 +585,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
nvlist_t *propval;
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
nvlist_free(propval);
}
@@ -495,7 +596,7 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
nvlist_t *propval;
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
nvlist_free(propval);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
new file mode 100644
index 000000000000..5f675b787df7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
@@ -0,0 +1,929 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+
+typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scrub_cb_t dsl_pool_scrub_clean_cb;
+static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+
+int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
+int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+
+extern int zfs_txg_timeout;
+
+static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
+ NULL,
+ dsl_pool_scrub_clean_cb
+};
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = arg1;
+ enum scrub_func *funcp = arg2;
+ dmu_object_type_t ot = 0;
+ boolean_t complete = B_FALSE;
+
+ dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
+
+ ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
+ ASSERT(*funcp > SCRUB_FUNC_NONE);
+ ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
+
+ dp->dp_scrub_min_txg = 0;
+ dp->dp_scrub_max_txg = tx->tx_txg;
+
+ if (*funcp == SCRUB_FUNC_CLEAN) {
+ vdev_t *rvd = dp->dp_spa->spa_root_vdev;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(rvd);
+
+ if (vdev_resilver_needed(rvd,
+ &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
+ spa_event_notify(dp->dp_spa, NULL,
+ ESC_ZFS_RESILVER_START);
+ dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
+ tx->tx_txg);
+ }
+
+ /* zero out the scrub stats in all vdev_stat_t's */
+ vdev_scrub_stat_update(rvd,
+ dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
+ POOL_SCRUB_EVERYTHING, B_FALSE);
+
+ dp->dp_spa->spa_scrub_started = B_TRUE;
+ }
+
+ /* back to the generic stuff */
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ dp->dp_scrub_func = *funcp;
+ dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
+ bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+ dp->dp_scrub_restart = B_FALSE;
+ dp->dp_spa->spa_scrub_errors = 0;
+
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+ &dp->dp_scrub_func, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+ &dp->dp_scrub_queue_obj, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+ &dp->dp_scrub_min_txg, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+ &dp->dp_scrub_max_txg, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ &dp->dp_scrub_bookmark, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+ &dp->dp_spa->spa_scrub_errors, tx));
+
+ spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
+}
+
+int
+dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
+{
+ return (dsl_sync_task_do(dp, NULL,
+ dsl_pool_scrub_setup_sync, dp, &func, 0));
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = arg1;
+ boolean_t *completep = arg2;
+
+ if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+ return;
+
+ mutex_enter(&dp->dp_scrub_cancel_lock);
+
+ if (dp->dp_scrub_restart) {
+ dp->dp_scrub_restart = B_FALSE;
+ *completep = B_FALSE;
+ }
+
+ /* XXX this is scrub-clean specific */
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ while (dp->dp_spa->spa_scrub_inflight > 0) {
+ cv_wait(&dp->dp_spa->spa_scrub_io_cv,
+ &dp->dp_spa->spa_scrub_lock);
+ }
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+ dp->dp_spa->spa_scrub_started = B_FALSE;
+ dp->dp_spa->spa_scrub_active = B_FALSE;
+
+ dp->dp_scrub_func = SCRUB_FUNC_NONE;
+ VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ dp->dp_scrub_queue_obj, tx));
+ dp->dp_scrub_queue_obj = 0;
+ bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_QUEUE, tx));
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MIN_TXG, tx));
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_MAX_TXG, tx));
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, tx));
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_FUNC, tx));
+ VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_ERRORS, tx));
+
+ spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
+ "complete=%u", *completep);
+
+ /* below is scrub-clean specific */
+ vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
+ *completep);
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
+ *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
+ if (dp->dp_scrub_min_txg && *completep)
+ spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
+ spa_errlog_rotate(dp->dp_spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
+
+ dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
+ mutex_exit(&dp->dp_scrub_cancel_lock);
+}
+
+int
+dsl_pool_scrub_cancel(dsl_pool_t *dp)
+{
+ boolean_t complete = B_FALSE;
+
+ return (dsl_sync_task_do(dp, NULL,
+ dsl_pool_scrub_cancel_sync, dp, &complete, 3));
+}
+
+int
+dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ /*
+ * This function will be used by bp-rewrite wad to intercept frees.
+ */
+ return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
+ done, private, arc_flags));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+ return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
+ const zbookmark_t *zb2)
+{
+ uint64_t zb1nextL0, zb2thisobj;
+
+ ASSERT(zb1->zb_objset == zb2->zb_objset);
+ ASSERT(zb1->zb_object != -1ULL);
+ ASSERT(zb2->zb_level == 0);
+
+ /*
+ * A bookmark in the deadlist is considered to be after
+ * everything else.
+ */
+ if (zb2->zb_object == -1ULL)
+ return (B_TRUE);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ zb1nextL0 = (zb1->zb_blkid + 1) <<
+ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+ zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+ if (zb1->zb_object == 0) {
+ uint64_t nextobj = zb1nextL0 *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ return (nextobj <= zb2thisobj);
+ }
+
+ if (zb1->zb_object < zb2thisobj)
+ return (B_TRUE);
+ if (zb1->zb_object > zb2thisobj)
+ return (B_FALSE);
+ if (zb2->zb_object == 0)
+ return (B_FALSE);
+ return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static boolean_t
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+{
+ int elapsed_ticks;
+ int mintime;
+
+ if (dp->dp_scrub_pausing)
+ return (B_TRUE); /* we're already pausing */
+
+ if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb->zb_level != 0)
+ return (B_FALSE);
+
+ mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
+ zfs_scrub_min_time;
+ elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
+ if (elapsed_ticks > hz * zfs_txg_timeout ||
+ (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
+ dprintf("pausing at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+ dp->dp_scrub_pausing = B_TRUE;
+ dp->dp_scrub_bookmark = *zb;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_traverse_arg {
+ dsl_pool_t *zta_dp;
+ zil_header_t *zta_zh;
+} zil_traverse_arg_t;
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zil_traverse_arg_t *zta = arg;
+ dsl_pool_t *dp = zta->zta_dp;
+ zil_header_t *zh = zta->zta_zh;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= dp->dp_scrub_min_txg)
+ return;
+
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ return;
+
+ zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_traverse_arg_t *zta = arg;
+ dsl_pool_t *dp = zta->zta_dp;
+ zil_header_t *zh = zta->zta_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= dp->dp_scrub_min_txg)
+ return;
+
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return;
+
+ zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = lr->lr_foid;
+ zb.zb_level = BP_GET_LEVEL(bp);
+ zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+ }
+}
+
+static void
+traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_traverse_arg_t zta = { dp, zh };
+ zilog_t *zilog;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && (spa_mode & FWRITE))
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+static void
+scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
+ arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
+{
+ int err;
+ arc_buf_t *buf = NULL;
+
+ if (bp->blk_birth == 0)
+ return;
+
+ if (bp->blk_birth <= dp->dp_scrub_min_txg)
+ return;
+
+ if (scrub_pause(dp, zb))
+ return;
+
+ if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg), don't bother doing it again.
+ */
+ if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
+ return;
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for pausing
+ * again.
+ */
+ if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
+ }
+ }
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read(NULL, dp->dp_spa, bp, pbuf,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ dp->dp_spa->spa_scrub_errors++;
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+ return;
+ }
+ cbp = buf->b_data;
+
+ for (i = 0; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ scrub_visitbp(dp, dnp, buf, cbp, &czb);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ dnode_phys_t *child_dnp;
+ int i, j;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read(NULL, dp->dp_spa, bp, pbuf,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ dp->dp_spa->spa_scrub_errors++;
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+ return;
+ }
+ child_dnp = buf->b_data;
+
+ for (i = 0; i < epb; i++, child_dnp++) {
+ for (j = 0; j < child_dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset,
+ zb->zb_blkid * epb + i,
+ child_dnp->dn_nlevels - 1, j);
+ scrub_visitbp(dp, child_dnp, buf,
+ &child_dnp->dn_blkptr[j], &czb);
+ }
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+ int j;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ dp->dp_spa->spa_scrub_errors++;
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+ return;
+ }
+
+ osp = buf->b_data;
+
+ traverse_zil(dp, &osp->os_zil_header);
+
+ for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, 0,
+ osp->os_meta_dnode.dn_nlevels - 1, j);
+ scrub_visitbp(dp, &osp->os_meta_dnode, buf,
+ &osp->os_meta_dnode.dn_blkptr[j], &czb);
+ }
+ }
+
+ (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
+}
+
+static void
+scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
+{
+ zbookmark_t zb;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
+ scrub_visitbp(dp, NULL, NULL, bp, &zb);
+}
+
+void
+dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+ return;
+
+ if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
+ SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
+ } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_object, tx) != 0) {
+ return;
+ }
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, tx) == 0);
+ }
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+}
+
+void
+dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+ return;
+
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+ if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
+ dp->dp_scrub_bookmark.zb_objset =
+ ds->ds_phys->ds_prev_snap_obj;
+ } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_object, tx) == 0) {
+ VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_phys->ds_prev_snap_obj, tx) == 0);
+ }
+}
+
+struct enqueue_clones_arg {
+ dmu_tx_t *tx;
+ uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct enqueue_clones_arg *eca = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp;
+
+ err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+ dp = ds->ds_dir->dd_pool;
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_object, eca->tx) == 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+ uint64_t min_txg_save;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ min_txg_save = dp->dp_scrub_min_txg;
+ dp->dp_scrub_min_txg =
+ MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
+ scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
+ dp->dp_scrub_min_txg = min_txg_save;
+
+ if (dp->dp_scrub_pausing)
+ goto out;
+
+ /*
+ * Add descendent datasets to work queue.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, tx) == 0);
+ }
+ if (ds->ds_phys->ds_num_children > 1) {
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ struct enqueue_clones_arg eca;
+ eca.tx = tx;
+ eca.originobj = ds->ds_object;
+
+ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ } else {
+ VERIFY(zap_join(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj,
+ dp->dp_scrub_queue_obj, tx) == 0);
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp;
+
+ err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ dp = ds->ds_dir->dd_pool;
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+ ds->ds_object, tx) == 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ boolean_t complete = B_TRUE;
+
+ if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+ return;
+
+ /* If the spa is not fully loaded, don't bother. */
+ if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+ return;
+
+ if (dp->dp_scrub_restart) {
+ enum scrub_func func = dp->dp_scrub_func;
+ dp->dp_scrub_restart = B_FALSE;
+ dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
+ }
+
+ if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+ /*
+ * We must have resumed after rebooting; reset the vdev
+ * stats to know that we're doing a scrub (although it
+ * will think we're just starting now).
+ */
+ vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+ dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
+ POOL_SCRUB_EVERYTHING, B_FALSE);
+ }
+
+ dp->dp_scrub_pausing = B_FALSE;
+ dp->dp_scrub_start_time = lbolt64;
+ dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
+ dp->dp_spa->spa_scrub_active = B_TRUE;
+
+ if (dp->dp_scrub_bookmark.zb_objset == 0) {
+ /* First do the MOS & ORIGIN */
+ scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
+ if (dp->dp_scrub_pausing)
+ goto out;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ } else {
+ scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!dp->dp_scrub_pausing);
+ } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
+ /*
+ * If we were paused, continue from here. Note if the
+ * ds we were paused on was deleted, the zb_objset will
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
+ if (dp->dp_scrub_pausing)
+ goto out;
+ }
+
+ /*
+ * In case we were paused right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+
+ /* keep pulling things out of the zap-object-as-queue */
+ while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
+ zap_cursor_retrieve(&zc, &za) == 0) {
+ VERIFY(0 == zap_remove(dp->dp_meta_objset,
+ dp->dp_scrub_queue_obj, za.za_name, tx));
+ scrub_visitds(dp, za.za_first_integer, tx);
+ if (dp->dp_scrub_pausing)
+ break;
+ zap_cursor_fini(&zc);
+ }
+ zap_cursor_fini(&zc);
+ if (dp->dp_scrub_pausing)
+ goto out;
+
+ /* done. */
+
+ dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
+ return;
+out:
+ VERIFY(0 == zap_update(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ &dp->dp_scrub_bookmark, tx));
+ VERIFY(0 == zap_update(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+ &dp->dp_spa->spa_scrub_errors, tx));
+
+ /* XXX this is scrub-clean specific */
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ while (dp->dp_spa->spa_scrub_inflight > 0) {
+ cv_wait(&dp->dp_spa->spa_scrub_io_cv,
+ &dp->dp_spa->spa_scrub_lock);
+ }
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+}
+
+void
+dsl_pool_scrub_restart(dsl_pool_t *dp)
+{
+ mutex_enter(&dp->dp_scrub_cancel_lock);
+ dp->dp_scrub_restart = B_TRUE;
+ mutex_exit(&dp->dp_scrub_cancel_lock);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+dsl_pool_scrub_clean_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_t *zb)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ int d;
+ spa_t *spa = dp->dp_spa;
+ boolean_t needs_io;
+ int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+ int zio_priority;
+
+ if (dp->dp_scrub_isresilver == 0) {
+ /* It's a scrub */
+ zio_flags |= ZIO_FLAG_SCRUB;
+ zio_priority = ZIO_PRIORITY_SCRUB;
+ needs_io = B_TRUE;
+ } else {
+ /* It's a resilver */
+ zio_flags |= ZIO_FLAG_RESILVER;
+ zio_priority = ZIO_PRIORITY_RESILVER;
+ needs_io = B_FALSE;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vdev_t *vd = vdev_lookup_top(spa,
+ DVA_GET_VDEV(&bp->blk_dva[d]));
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined +=
+ DVA_GET_ASIZE(&bp->blk_dva[d]);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io) {
+ if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best we can do is look at the
+ * pool-wide DTL.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that this can't
+ * happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
+ bp->blk_birth, 1);
+ }
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ dsl_pool_scrub_clean_done, NULL, zio_priority,
+ zio_flags, zb));
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+int
+dsl_pool_scrub_clean(dsl_pool_t *dp)
+{
+ /*
+ * Purge all vdev caches. We do this here rather than in sync
+ * context because this requires a writer lock on the spa_config
+ * lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER);
+ dp->dp_spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(dp->dp_spa->spa_root_vdev);
+ dp->dp_spa->spa_scrub_reopen = B_FALSE;
+ spa_config_exit(dp->dp_spa, SCL_ALL, FTAG);
+
+ return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
index 17deb569c4ab..21100225abf7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -30,6 +30,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
+#include <sys/cred.h>
#define DST_AVG_BLKSHIFT 14
@@ -49,6 +50,7 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
offsetof(dsl_sync_task_t, dst_node));
dstg->dstg_pool = dp;
+ dstg->dstg_cr = CRED();
return (dstg);
}
@@ -123,6 +125,16 @@ top:
}
void
+dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+{
+ uint64_t txg;
+
+ dstg->dstg_nowaiter = B_TRUE;
+ txg = dmu_tx_get_txg(tx);
+ VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+}
+
+void
dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
{
dsl_sync_task_t *dst;
@@ -146,7 +158,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
* Check for sufficient space.
*/
dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
- dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
+ dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
/* don't bother trying again */
if (dstg->dstg_err == ERESTART)
dstg->dstg_err = EAGAIN;
@@ -171,12 +183,16 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
*/
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
+ dstg->dstg_cr, tx);
}
}
rw_exit(&dstg->dstg_pool->dp_config_rwlock);
dsl_dir_tempreserve_clear(tr_cookie, tx);
+
+ if (dstg->dstg_nowaiter)
+ dsl_sync_task_group_destroy(dstg);
}
int
@@ -194,3 +210,16 @@ dsl_sync_task_do(dsl_pool_t *dp,
dsl_sync_task_group_destroy(dstg);
return (err);
}
+
+void
+dsl_sync_task_do_nowait(dsl_pool_t *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx)
+{
+ dsl_sync_task_group_t *dstg;
+
+ dstg = dsl_sync_task_group_create(dp);
+ dsl_sync_task_create(dstg, checkfunc, syncfunc,
+ arg1, arg2, blocks_modified);
+ dsl_sync_task_group_nowait(dstg, tx);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 0dba134cef9b..22b56d617799 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
@@ -35,6 +33,7 @@
#include <sys/zio.h>
uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* ==========================================================================
@@ -341,7 +340,7 @@ metaslab_fini(metaslab_t *msp)
int t;
vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc);
+ -msp->ms_smo.smo_alloc, B_TRUE);
metaslab_group_remove(mg, msp);
@@ -534,8 +533,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
@@ -569,10 +568,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0);
+ vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -714,11 +713,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* Allocate a block for the specified i/o.
*/
static int
-metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
- dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
metaslab_group_t *mg, *rotor;
- metaslab_class_t *mc;
vdev_t *vd;
int dshift = 3;
int all_zero;
@@ -728,7 +726,11 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
ASSERT(!DVA_IS_VALID(&dva[d]));
- mc = spa_metaslab_class_select(spa);
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+ if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0)
+ return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
@@ -754,7 +756,7 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (hintdva_avoid)
+ if (flags & METASLAB_HINTBP_AVOID)
mg = vd->vdev_mg->mg_next;
else
mg = vd->vdev_mg;
@@ -764,12 +766,34 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
} else {
mg = mc->mc_rotor;
}
- rotor = mg;
+ /*
+ * If the hint put us into the wrong class, just follow the rotor.
+ */
+ if (mg->mg_class != mc)
+ mg = mc->mc_rotor;
+
+ rotor = mg;
top:
all_zero = B_TRUE;
do {
vd = mg->mg_vd;
+ /*
+ * Don't allocate from faulted devices.
+ */
+ if (!vdev_writeable(vd))
+ goto next;
+ /*
+ * Avoid writing single-copy data to a failing vdev
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && dshift == 3) {
+ all_zero = B_FALSE;
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
distance = vd->vdev_asize >> dshift;
if (distance <= (1ULL << vd->vdev_ms_shift))
@@ -818,11 +842,12 @@ top:
DVA_SET_VDEV(&dva[d], vd->vdev_id);
DVA_SET_OFFSET(&dva[d], offset);
- DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
DVA_SET_ASIZE(&dva[d], asize);
return (0);
}
+next:
mc->mc_rotor = mg->mg_next;
mc->mc_allocated = 0;
} while ((mg = mg->mg_next) != rotor);
@@ -879,38 +904,6 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
-
- /*
- * verify that this region is actually allocated in
- * either a ms_allocmap or the ms_map
- */
- if (msp->ms_map.sm_loaded) {
- boolean_t allocd = B_FALSE;
- int i;
-
- if (!space_map_contains(&msp->ms_map, offset, size)) {
- allocd = B_TRUE;
- } else {
- for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
- space_map_t *sm = &msp->ms_allocmap
- [(txg - i) & TXG_MASK];
- if (space_map_contains(sm,
- offset, size)) {
- allocd = B_TRUE;
- break;
- }
- }
- }
-
- if (!allocd) {
- zfs_panic_recover("freeing free segment "
- "(vdev=%llu offset=%llx size=%llx)",
- (longlong_t)vdev, (longlong_t)offset,
- (longlong_t)size);
- }
- }
-
-
}
mutex_exit(&msp->ms_lock);
@@ -946,16 +939,18 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error) {
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
}
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
space_map_claim(&msp->ms_map, offset, size);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ }
mutex_exit(&msp->ms_lock);
@@ -963,32 +958,45 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
}
int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
- uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
- int d;
int error = 0;
+ ASSERT(bp->blk_birth == 0);
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ if (mc->mc_rotor == NULL) { /* no vdevs in this class */
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (ENOSPC);
+ }
+
ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
ASSERT(BP_GET_NDVAS(bp) == 0);
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
- for (d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
- txg, hintbp_avoid);
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ txg, flags);
if (error) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
bzero(&dva[d], sizeof (dva_t));
}
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ bp->blk_birth = txg;
+
return (0);
}
@@ -997,12 +1005,16 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
- int d;
ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+
+ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
- for (d = 0; d < ndvas; d++)
+ for (int d = 0; d < ndvas; d++)
metaslab_free_dva(spa, &dva[d], txg, now);
+
+ spa_config_exit(spa, SCL_FREE, FTAG);
}
int
@@ -1010,14 +1022,28 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
- int d, error;
- int last_error = 0;
+ int error = 0;
ASSERT(!BP_IS_HOLE(bp));
- for (d = 0; d < ndvas; d++)
+ if (txg != 0) {
+ /*
+ * First do a dry run to make sure all DVAs are claimable,
+ * so we don't have to unwind from partial failures below.
+ */
+ if ((error = metaslab_claim(spa, bp, 0)) != 0)
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++)
if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
- last_error = error;
+ break;
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ ASSERT(error == 0 || txg == 0);
- return (last_error);
+ return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
index a2f4614fed87..5fe4e638055a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -61,11 +60,13 @@ refcount_fini(void)
void
refcount_create(refcount_t *rc)
{
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&rc->rc_list, sizeof (reference_t),
offsetof(reference_t, ref_link));
list_create(&rc->rc_removed, sizeof (reference_t),
offsetof(reference_t, ref_link));
- mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+ rc->rc_count = 0;
+ rc->rc_removed_count = 0;
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
new file mode 100644
index 000000000000..db3b70fc68b0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t. This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock). Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out. At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let the proceed. If they are not, then the reader blocks for the
+ * waiting writers. Hence, we do not starve writers.
+ */
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+ struct rrw_node *rn_next;
+ rrwlock_t *rn_rrl;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ if (refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (NULL);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl)
+ return (rn);
+ }
+ return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+ rn->rn_rrl = rrl;
+ rn->rn_next = tsd_get(rrw_tsd_key);
+ VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+ rrw_node_t *prev = NULL;
+
+ if (refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (B_FALSE);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl) {
+ if (prev)
+ prev->rn_next = rn->rn_next;
+ else
+ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+ kmem_free(rn, sizeof (*rn));
+ return (B_TRUE);
+ }
+ prev = rn;
+ }
+ return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl)
+{
+ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+ rrl->rr_writer = NULL;
+ refcount_create(&rrl->rr_anon_rcount);
+ refcount_create(&rrl->rr_linked_rcount);
+ rrl->rr_writer_wanted = B_FALSE;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+ mutex_destroy(&rrl->rr_lock);
+ cv_destroy(&rrl->rr_cv);
+ ASSERT(rrl->rr_writer == NULL);
+ refcount_destroy(&rrl->rr_anon_rcount);
+ refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+ ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+ while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+ refcount_is_zero(&rrl->rr_anon_rcount) &&
+ rrn_find(rrl) == NULL))
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+ if (rrl->rr_writer_wanted) {
+ /* may or may not be a re-entrant enter */
+ rrn_add(rrl);
+ (void) refcount_add(&rrl->rr_linked_rcount, tag);
+ } else {
+ (void) refcount_add(&rrl->rr_anon_rcount, tag);
+ }
+ ASSERT(rrl->rr_writer == NULL);
+ mutex_exit(&rrl->rr_lock);
+}
+
+static void
+rrw_enter_write(rrwlock_t *rrl)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+
+ while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
+ refcount_count(&rrl->rr_linked_rcount) > 0 ||
+ rrl->rr_writer != NULL) {
+ rrl->rr_writer_wanted = B_TRUE;
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+ }
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_writer = curthread;
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrw_enter_read(rrl, tag);
+ else
+ rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !refcount_is_zero(&rrl->rr_linked_rcount) ||
+ rrl->rr_writer != NULL);
+
+ if (rrl->rr_writer == NULL) {
+ if (rrn_find_and_remove(rrl)) {
+ if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
+ cv_broadcast(&rrl->rr_cv);
+
+ } else {
+ if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
+ cv_broadcast(&rrl->rr_cv);
+ }
+ } else {
+ ASSERT(rrl->rr_writer == curthread);
+ ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
+ refcount_is_zero(&rrl->rr_linked_rcount));
+ rrl->rr_writer = NULL;
+ cv_broadcast(&rrl->rr_cv);
+ }
+ mutex_exit(&rrl->rr_lock);
+}
+
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+ boolean_t held;
+
+ mutex_enter(&rrl->rr_lock);
+ if (rw == RW_WRITER) {
+ held = (rrl->rr_writer == curthread);
+ } else {
+ held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !refcount_is_zero(&rrl->rr_linked_rcount));
+ }
+ mutex_exit(&rrl->rr_lock);
+
+ return (held);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
index ce5c26131af5..ca7076cb6fd9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -31,20 +30,20 @@
#include <sys/zio_checksum.h>
/*
- * SHA-256 checksum, as specified in FIPS 180-2, available at:
- * http://csrc.nist.gov/cryptval
+ * SHA-256 checksum, as specified in FIPS 180-3, available at:
+ * http://csrc.nist.gov/publications/PubsFIPS.html
*
* This is a very compact implementation of SHA-256.
* It is designed to be simple and portable, not to be fast.
*/
/*
- * The literal definitions according to FIPS180-2 would be:
+ * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
*
- * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
- * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ * Ch(x, y, z) (x & y) ^ (~x & z)
+ * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z)
*
- * We use logical equivalents which require one less op.
+ * We use equivalent logical reductions here that require one less op.
*/
#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
@@ -105,20 +104,19 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
uint8_t pad[128];
- int padsize = size & 63;
- int i;
+ int i, padsize;
- for (i = 0; i < size - padsize; i += 64)
+ for (i = 0; i < (size & ~63ULL); i += 64)
SHA256Transform(H, (uint8_t *)buf + i);
- for (i = 0; i < padsize; i++)
- pad[i] = ((uint8_t *)buf)[i];
+ for (padsize = 0; i < size; i++)
+ pad[padsize++] = *((uint8_t *)buf + i);
for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
pad[padsize] = 0;
- for (i = 0; i < 8; i++)
- pad[padsize++] = (size << 3) >> (56 - 8 * i);
+ for (i = 56; i >= 0; i -= 8)
+ pad[padsize++] = (size << 3) >> i;
for (i = 0; i < padsize; i += 64)
SHA256Transform(H, pad + i);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 6a7c525ae991..163b21572247 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file contains all the routines used when modifying on-disk SPA state.
* This includes opening, importing, destroying, exporting a pool, and syncing a
@@ -56,16 +54,388 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
+#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
+#include <sys/spa_boot.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
-int zio_taskq_threads = 0;
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
-TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads);
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW,
- &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type");
+int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE INTR */
+ { 1, 1 }, /* ZIO_TYPE_NULL */
+ { 1, 8 }, /* ZIO_TYPE_READ */
+ { 8, 1 }, /* ZIO_TYPE_WRITE */
+ { 1, 1 }, /* ZIO_TYPE_FREE */
+ { 1, 1 }, /* ZIO_TYPE_CLAIM */
+ { 1, 1 }, /* ZIO_TYPE_IOCTL */
+};
+static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static boolean_t spa_has_active_shared_spare(spa_t *spa);
+
+/*
+ * ==========================================================================
+ * SPA properties routines
+ * ==========================================================================
+ */
+
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+ uint64_t intval, zprop_source_t src)
+{
+ const char *propname = zpool_prop_to_name(prop);
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+
+ if (strval != NULL)
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+ else
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
+
+ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+/*
+ * Get property values from the spa configuration.
+ */
+static void
+spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+{
+ uint64_t size = spa_get_space(spa);
+ uint64_t used = spa_get_alloc(spa);
+ uint64_t cap, version;
+ zprop_source_t src = ZPROP_SRC_NONE;
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa->spa_props_lock));
+
+ /*
+ * readonly properties
+ */
+ spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);
+
+ cap = (size == 0) ? 0 : (used * 100 / size);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+ spa->spa_root_vdev->vdev_state, src);
+
+ /*
+ * settable properties that are not stored in the pool property object.
+ */
+ version = spa_version(spa);
+ if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
+
+ if (spa->spa_root != NULL)
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
+ 0, ZPROP_SRC_LOCAL);
+
+ if ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ if (dp->scd_path == NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ "none", 0, ZPROP_SRC_LOCAL);
+ } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ dp->scd_path, 0, ZPROP_SRC_LOCAL);
+ }
+ }
+}
+
+/*
+ * Get zpool property values.
+ */
+int
+spa_prop_get(spa_t *spa, nvlist_t **nvp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_props_lock);
+
+ /*
+ * Get properties from the spa config.
+ */
+ spa_prop_get_config(spa, nvp);
+
+ /* If no pool property object, no more prop to get. */
+ if (spa->spa_pool_props_object == 0) {
+ mutex_exit(&spa->spa_props_lock);
+ return (0);
+ }
+
+ /*
+ * Get properties from the MOS pool property object.
+ */
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t intval = 0;
+ char *strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ zpool_prop_t prop;
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
+ continue;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* integer property */
+ if (za.za_first_integer !=
+ zpool_prop_default_numeric(prop))
+ src = ZPROP_SRC_LOCAL;
+
+ if (prop == ZPOOL_PROP_BOOTFS) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if (err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds)) {
+ rw_exit(&dp->dp_config_rwlock);
+ break;
+ }
+
+ strval = kmem_alloc(
+ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
+ KM_SLEEP);
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_rele(ds, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+ } else {
+ strval = NULL;
+ intval = za.za_first_integer;
+ }
+
+ spa_prop_add_list(*nvp, prop, strval, intval, src);
+
+ if (strval != NULL)
+ kmem_free(strval,
+ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
+
+ break;
+
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
+ err = zap_lookup(mos, spa->spa_pool_props_object,
+ za.za_name, 1, za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ mutex_exit(&spa->spa_props_lock);
+out:
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Validate the given pool properties nvlist and modify the list
+ * for the property values to be set.
+ */
+static int
+spa_prop_validate(spa_t *spa, nvlist_t *props)
+{
+ nvpair_t *elem;
+ int error = 0, reset_bootfs = 0;
+ uint64_t objnum;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ zpool_prop_t prop;
+ char *propname, *strval;
+ uint64_t intval;
+ objset_t *os;
+ char *slash;
+
+ propname = nvpair_name(elem);
+
+ if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
+ return (EINVAL);
+
+ switch (prop) {
+ case ZPOOL_PROP_VERSION:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error &&
+ (intval < spa_version(spa) || intval > SPA_VERSION))
+ error = EINVAL;
+ break;
+
+ case ZPOOL_PROP_DELEGATION:
+ case ZPOOL_PROP_AUTOREPLACE:
+ case ZPOOL_PROP_LISTSNAPS:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = EINVAL;
+ break;
+
+ case ZPOOL_PROP_BOOTFS:
+ if (spa_version(spa) < SPA_VERSION_BOOTFS) {
+ error = ENOTSUP;
+ break;
+ }
+
+ /*
+ * Make sure the vdev config is bootable
+ */
+ if (!vdev_is_bootable(spa->spa_root_vdev)) {
+ error = ENOTSUP;
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ error = nvpair_value_string(elem, &strval);
+
+ if (!error) {
+ uint64_t compress;
+
+ if (strval == NULL || strval[0] == '\0') {
+ objnum = zpool_prop_default_numeric(
+ ZPOOL_PROP_BOOTFS);
+ break;
+ }
+
+ if (error = dmu_objset_open(strval, DMU_OST_ZFS,
+ DS_MODE_USER | DS_MODE_READONLY, &os))
+ break;
+
+ /* We don't support gzip bootable datasets */
+ if ((error = dsl_prop_get_integer(strval,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ &compress, NULL)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(compress)) {
+ error = ENOTSUP;
+ } else {
+ objnum = dmu_objset_id(os);
+ }
+ dmu_objset_close(os);
+ }
+ break;
+
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
+ intval > ZIO_FAILURE_MODE_PANIC))
+ error = EINVAL;
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_suspended(spa)) {
+ spa->spa_failmode = intval;
+ error = EIO;
+ }
+ break;
+
+ case ZPOOL_PROP_CACHEFILE:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+
+ if (strval[0] == '\0')
+ break;
+
+ if (strcmp(strval, "none") == 0)
+ break;
+
+ if (strval[0] != '/') {
+ error = EINVAL;
+ break;
+ }
+
+ slash = strrchr(strval, '/');
+ ASSERT(slash != NULL);
+
+ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+ strcmp(slash, "/..") == 0)
+ error = EINVAL;
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+ if (!error && reset_bootfs) {
+ error = nvlist_remove(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
+
+ if (!error) {
+ error = nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
+ }
+ }
+
+ return (error);
+}
+
+int
+spa_prop_set(spa_t *spa, nvlist_t *nvp)
+{
+ int error;
+
+ if ((error = spa_prop_validate(spa, nvp)) != 0)
+ return (error);
+
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
+ spa, nvp, 3));
+}
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
/*
* ==========================================================================
@@ -117,40 +487,26 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
static void
spa_activate(spa_t *spa)
{
- int t;
- int nthreads = zio_taskq_threads;
- char name[32];
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_normal_class = metaslab_class_create();
+ spa->spa_log_class = metaslab_class_create();
- if (nthreads == 0)
- nthreads = max_ncpus;
- for (t = 0; t < ZIO_TYPES; t++) {
- snprintf(name, sizeof(name), "spa_zio_issue %d", t);
- spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
- snprintf(name, sizeof(name), "spa_zio_intr %d", t);
- spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
+ zio_taskq_threads[t][q], maxclsyspri, 50,
+ INT_MAX, TASKQ_PREPOPULATE);
+ }
}
- rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
-
- mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&spa->spa_dirty_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_dirty_node));
+ list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_config_dirty_node));
+ list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_state_dirty_node));
txg_list_create(&spa->spa_vdev_txg_list,
offsetof(struct vdev, vdev_txg_node));
@@ -169,8 +525,6 @@ spa_activate(spa_t *spa)
static void
spa_deactivate(spa_t *spa)
{
- int t;
-
ASSERT(spa->spa_sync_on == B_FALSE);
ASSERT(spa->spa_dsl_pool == NULL);
ASSERT(spa->spa_root_vdev == NULL);
@@ -179,18 +533,22 @@ spa_deactivate(spa_t *spa)
txg_list_destroy(&spa->spa_vdev_txg_list);
- list_destroy(&spa->spa_dirty_list);
+ list_destroy(&spa->spa_config_dirty_list);
+ list_destroy(&spa->spa_state_dirty_list);
- for (t = 0; t < ZIO_TYPES; t++) {
- taskq_destroy(spa->spa_zio_issue_taskq[t]);
- taskq_destroy(spa->spa_zio_intr_taskq[t]);
- spa->spa_zio_issue_taskq[t] = NULL;
- spa->spa_zio_intr_taskq[t] = NULL;
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ taskq_destroy(spa->spa_zio_taskq[t][q]);
+ spa->spa_zio_taskq[t][q] = NULL;
+ }
}
metaslab_class_destroy(spa->spa_normal_class);
spa->spa_normal_class = NULL;
+ metaslab_class_destroy(spa->spa_log_class);
+ spa->spa_log_class = NULL;
+
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
@@ -200,16 +558,6 @@ spa_deactivate(spa_t *spa)
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
- rw_destroy(&spa->spa_traverse_lock);
- mutex_destroy(&spa->spa_uberblock_lock);
- mutex_destroy(&spa->spa_errlog_lock);
- mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_config_lock.scl_lock);
- cv_destroy(&spa->spa_config_lock.scl_cv);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
- mutex_destroy(&spa->spa_history_lock);
- mutex_destroy(&spa->spa_props_lock);
-
spa->spa_state = POOL_STATE_UNINITIALIZED;
}
@@ -233,8 +581,13 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
if ((*vdp)->vdev_ops->vdev_op_leaf)
return (0);
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
+ error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error) {
vdev_free(*vdp);
*vdp = NULL;
return (EINVAL);
@@ -263,6 +616,8 @@ spa_unload(spa_t *spa)
{
int i;
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
/*
* Stop async tasks.
*/
@@ -277,10 +632,17 @@ spa_unload(spa_t *spa)
}
/*
- * Wait for any outstanding prefetch I/O to complete.
+ * Wait for any outstanding async I/O to complete.
+ */
+ mutex_enter(&spa->spa_async_root_lock);
+ while (spa->spa_async_root_count != 0)
+ cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
+ mutex_exit(&spa->spa_async_root_lock);
+
+ /*
+ * Drop and purge level 2 cache
*/
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_config_exit(spa, FTAG);
+ spa_l2cache_drop(spa);
/*
* Close the dsl pool.
@@ -297,16 +659,31 @@ spa_unload(spa_t *spa)
vdev_free(spa->spa_root_vdev);
ASSERT(spa->spa_root_vdev == NULL);
- for (i = 0; i < spa->spa_nspares; i++)
- vdev_free(spa->spa_spares[i]);
- if (spa->spa_spares) {
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
- spa->spa_spares = NULL;
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
+ if (spa->spa_spares.sav_vdevs) {
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+ spa->spa_spares.sav_vdevs = NULL;
+ }
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
}
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
+ spa->spa_spares.sav_count = 0;
+
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++)
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ if (spa->spa_l2cache.sav_vdevs) {
+ kmem_free(spa->spa_l2cache.sav_vdevs,
+ spa->spa_l2cache.sav_count * sizeof (void *));
+ spa->spa_l2cache.sav_vdevs = NULL;
}
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ }
+ spa->spa_l2cache.sav_count = 0;
spa->spa_async_suspended = 0;
}
@@ -314,8 +691,8 @@ spa_unload(spa_t *spa)
/*
* Load (or re-load) the current list of vdevs describing the active spares for
* this pool. When this is called, we have some form of basic information in
- * 'spa_sparelist'. We parse this into vdevs, try to o