author    | Pawel Jakub Dawidek <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000
committer | Pawel Jakub Dawidek <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000
commit    | 1ba4a712dde6e6c613fc411a96958b4ade67de4c (patch)
tree      | 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts
parent    | 8fc061164d74a4c9775f39da3c0b5d02112866c8 (diff)
download  | src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.tar.gz, src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.zip
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge number of changes; I'll enumerate only the user-visible
ones (usage sketches for several of them follow the list):
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows additional disks to be used as cache.
Huge performance improvements, mostly for random reads of largely
static content.
- slog
Allows additional disks to be used for the ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows a regular user to perform privileged operations on files stored
on ZFS file systems owned by that user. Be very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support for booting off a ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Previously, if a write request failed, the system panicked. Now one
can select one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Like the quota and reservation properties, but they don't count space
consumed by child file systems, clones, and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- Extended attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
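Delegated administration, a minimal sketch assuming a hypothetical pool
"tank" and user "alice":
    # Let alice create file systems and snapshots under tank/home
    # without root (pool, dataset, and user names are made up).
    zfs allow alice create,mount,snapshot tank/home
    # alice can then run, for example:
    zfs snapshot tank/home/alice@today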
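L2ARC cache devices are added with zpool(8); a sketch with a
hypothetical disk:
    # Use da2 as a second-level read cache for the pool "tank".
    zpool add tank cache da2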
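A separate intent-log (slog) device, same hypothetical naming:
    # Dedicate da3 to the ZFS Intent Log so synchronous operations
    # such as fsync(2) commit to fast storage first.
    zpool add tank log da3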
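vfs.zfs.super_owner is a sysctl knob; a sketch assuming it is a simple
on/off toggle:
    # Allow a file system's owner to perform privileged operations on it.
    sysctl vfs.zfs.super_owner=1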
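The failure mode is selected per pool via the failmode property; a
sketch, again with a hypothetical pool name:
    # Serve reads if possible and block writes on device failure,
    # instead of panicking.
    zpool set failmode=continue tank
    zpool get failmode tank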
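refquota/refreservation usage, hypothetical dataset names:
    # Limit the file system's own data to 10 GB; snapshots, clones, and
    # children do not count against this limit.
    zfs set refquota=10G tank/home/alice
    # Guarantee 1 GB to the file system itself on the same terms.
    zfs set refreservation=1G tank/home/alice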
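Sparse volumes are created with the -s flag; a sketch:
    # A 100 GB ZVOL whose space is not reserved in the pool up front.
    zfs create -s -V 100g tank/vol0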
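Creation-time properties, a sketch:
    # Set properties atomically at the moment the dataset is created.
    zfs create -o compression=on -o atime=off tank/logs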
Notes:
svn path=/head/; revision=185029
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
144 files changed, 65243 insertions, 9789 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 1800e792fa1e..cf49c78a5b0e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -20,11 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This Makefile defines all file modules for the directory uts/common # and its children. These are the source files which may be considered # common to all SunOS systems. @@ -46,7 +44,9 @@ ZFS_COMMON_OBJS += \ dsl_pool.o \ dsl_synctask.o \ dmu_zfetch.o \ + dsl_deleg.o \ dsl_prop.o \ + dsl_scrub.o \ fletcher.o \ gzip.o \ lzjb.o \ @@ -64,6 +64,7 @@ ZFS_COMMON_OBJS += \ unique.o \ vdev.o \ vdev_cache.o \ + vdev_file.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ @@ -75,6 +76,7 @@ ZFS_COMMON_OBJS += \ zap_micro.o \ zfs_byteswap.o \ zfs_fm.o \ + zfs_fuid.o \ zfs_znode.o \ zil.o \ zio.o \ @@ -84,7 +86,11 @@ ZFS_COMMON_OBJS += \ ZFS_SHARED_OBJS += \ zfs_namecheck.o \ - zfs_prop.o + zfs_deleg.o \ + zfs_prop.o \ + zfs_comutil.o \ + zpool_prop.o \ + zprop_common.o ZFS_OBJS += \ $(ZFS_COMMON_OBJS) \ @@ -96,6 +102,7 @@ ZFS_OBJS += \ zfs_log.o \ zfs_replay.o \ zfs_rlock.o \ + rrwlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index dd2aa82304ab..d9eb88a40202 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -20,7 +20,7 @@ */ /* Portions Copyright 2007 Shivakumar GN */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/systm.h> +#include <sys/sunddi.h> #include <sys/uio.h> #include <sys/vfs.h> #include <sys/vnode.h> @@ -60,7 +61,7 @@ * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, - * without enforcing the filesystem to hand over management to GFS. The + * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() @@ -116,6 +117,42 @@ */ /* + * gfs_get_parent_ino: used to obtain a parent inode number and the + * inode number of the given vnode in preparation for calling gfs_readdir_init. 
+ */ +int +gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + ino64_t *pino, ino64_t *ino) +{ + vnode_t *parent; + gfs_dir_t *dp = dvp->v_data; + int error; + + *ino = dp->gfsd_file.gfs_ino; + parent = dp->gfsd_file.gfs_parent; + + if (parent == NULL) { + *pino = *ino; /* root of filesystem */ + } else if (dvp->v_flag & V_XATTRDIR) { +#ifdef TODO + vattr_t va; + + va.va_mask = AT_NODEID; + error = VOP_GETATTR(parent, &va, 0, cr, ct); + if (error) + return (error); + *pino = va.va_nodeid; +#else + panic("%s:%u: not implemented", __func__, __LINE__); +#endif + } else { + *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; + } + + return (0); +} + +/* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length @@ -123,6 +160,7 @@ * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode + * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. * @@ -153,8 +191,10 @@ */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self) + uio_t *uiop, ino64_t parent, ino64_t self, int flags) { + size_t dirent_size; + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); @@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; - st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); + if (flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; + st->grd_flags = flags; return (0); } @@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, /* * gfs_readdir_emit_int: internal routine to emit directory entry * - * st - the current readdir state, which must have d_ino and d_name - * set + * st - the current readdir state, which must have d_ino/ed_ino + * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ @@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; + dirent64_t *dp; + edirent_t *edp; - namlen = strlen(st->grd_dirent->d_name); - reclen = DIRENT64_RECLEN(namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp = st->grd_dirent; + namlen = strlen(edp->ed_name); + reclen = EDIRENT_RECLEN(namlen); + } else { + dp = st->grd_dirent; + namlen = strlen(dp->d_name); + reclen = DIRENT64_RECLEN(namlen); + } if (reclen > uiop->uio_resid) { /* @@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, return (-1); } - /* XXX: This can change in the future. */ - st->grd_dirent->d_type = DT_DIR; - st->grd_dirent->d_reclen = (ushort_t)reclen; - st->grd_dirent->d_namlen = namlen; + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp->ed_off = next; + edp->ed_reclen = (ushort_t)reclen; + } else { + /* XXX: This can change in the future. 
*/ + dp->d_reclen = (ushort_t)reclen; + dp->d_type = DT_DIR; + dp->d_namlen = namlen; + } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); @@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name + * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or @@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int *ncookies, u_long **cookies) + ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; - st->grd_dirent->d_ino = ino; - (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edirent_t *edp = st->grd_dirent; + + edp->ed_ino = ino; + (void) strncpy(edp->ed_name, name, st->grd_namlen); + edp->ed_eflags = eflags; + } else { + dirent64_t *dp = st->grd_dirent; + + dp->d_ino = ino; + (void) strncpy(dp->d_name, name, st->grd_namlen); + } /* * Inter-entry offsets are invalid, so we assume a record size of @@ -266,11 +336,11 @@ top: voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", ncookies, cookies)) == 0) + ".", 0, ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", ncookies, cookies)) == 0) + "..", 0, ncookies, cookies)) == 0) goto top; } else { *voffp = voff; @@ -292,7 +362,13 @@ top: int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { - kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); + size_t dirent_size; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) @@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp) gfs_dir_t *dp = NULL; void *data; - if (fp->gfs_parent == NULL) + if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; @@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp) ge = NULL; found: + if (vp->v_flag & V_XATTRDIR) + VI_LOCK(fp->gfs_parent); VI_LOCK(vp); ASSERT(vp->v_count < 2); /* @@ -535,7 +613,8 @@ found: * Free vnode and release parent */ if (fp->gfs_parent) { - gfs_dir_unlock(dp); + if (dp) + gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); @@ -543,6 +622,8 @@ found: ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } + if (vp->v_flag & V_XATTRDIR) + VI_UNLOCK(fp->gfs_parent); return (data); } @@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp) } /* - * gfs_dir_lookup() + * gfs_dir_lookup_dynamic() * - * Looks up the given name in the directory and returns the corresponding vnode, - * if found. + * This routine looks up the provided name amongst the dynamic entries + * in the gfs directory and returns the corresponding vnode, if found. * - * First, we search statically defined entries, if any. If a match is found, - * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the - * existing vnode. 
Otherwise, we call the static entry's callback routine, - * caching the result if necessary. + * The gfs directory is expected to be locked by the caller prior to + * calling this function. The directory will be unlocked during the + * execution of this function, but will be locked upon return from the + * function. This function returns 0 on success, non-zero on error. * - * If no static entry is found, we invoke the lookup callback, if any. The - * arguments to this callback are: + * The dynamic lookups are performed by invoking the lookup + * callback, which is passed to this function as the first argument. + * The arguments to the callback are: * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, + * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode + * cr - pointer to cred + * flags - flags value from lookup request + * ignored here; currently only used to request + * insensitive lookups + * direntflgs - output parameter, directory entry flags + * ignored here; currently only used to indicate a lookup + * has more than one possible match when case is not considered + * realpnp - output parameter, real pathname + * ignored here; when lookup was performed case-insensitively, + * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) +static int +gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, + const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, + int *direntflags, pathname_t *realpnp) { - int i; - gfs_dirent_t *ge; - vnode_t *vp; - gfs_dir_t *dp = dvp->v_data; - int ret = 0; - - ASSERT(dvp->v_type == VDIR); + gfs_file_t *fp; + ino64_t ino; + int ret; - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); + ASSERT(GFS_DIR_LOCKED(dp)); + /* + * Drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* + * The callback for extended attributes returns a vnode + * with v_data from an underlying fs. + */ + if (ret == 0 && !IS_XATTRDIR(dvp)) { + fp = (gfs_file_t *)((*vpp)->v_data); + fp->gfs_index = -1; + fp->gfs_ino = ino; + } + + return (ret); +} + +/* + * gfs_dir_lookup_static() + * + * This routine looks up the provided name amongst the static entries + * in the gfs directory and returns the corresponding vnode, if found. + * The first argument to the function is a pointer to the comparison + * function this function should use to decide if names are a match. + * + * If a match is found, and GFS_CACHE_VNODE is set and the vnode + * exists, we simply return the existing vnode. Otherwise, we call + * the static entry's callback routine, caching the result if + * necessary. If the idx pointer argument is non-NULL, we use it to + * return the index of the matching static entry. + * + * The gfs directory is expected to be locked by the caller prior to calling + * this function. The directory may be unlocked during the execution of + * this function, but will be locked upon return from the function. + * + * This function returns 0 if a match is found, ENOENT if not. 
+ */ +static int +gfs_dir_lookup_static(int (*compare)(const char *, const char *), + gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, + vnode_t **vpp, pathname_t *rpnp) +{ + gfs_dirent_t *ge; + vnode_t *vp = NULL; + int i; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; - if (strcmp(ge->gfse_name, nm) == 0) { + if (compare(ge->gfse_name, nm) == 0) { + if (rpnp) + (void) strlcpy(rpnp->pn_buf, ge->gfse_name, + rpnp->pn_bufsize); + if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); - goto out; + break; } /* @@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of the - * cached vnode. + * for this entry, we discard the result in favor of + * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); @@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) gfs_dir_lock(dp); } } - - goto out; + break; } } - /* - * See if there is a dynamic constructor. - */ - if (dp->gfsd_lookup) { - ino64_t ino; - gfs_file_t *fp; + if (vp == NULL) + return (ENOENT); + else if (idx) + *idx = i; + *vpp = vp; + return (0); +} - /* - * Once again, drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); - gfs_dir_lock(dp); - if (ret != 0) - goto out; +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding + * vnode, if found. + * + * First, we search statically defined entries, if any, with a call to + * gfs_dir_lookup_static(). If no static entry is found, and we have + * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). + * + * This function returns 0 on success, non-zero on error. + */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, + int flags, int *direntflags, pathname_t *realpnp) +{ + gfs_dir_t *dp = dvp->v_data; + boolean_t casecheck; + vnode_t *dynvp = NULL; + vnode_t *vp = NULL; + int (*compare)(const char *, const char *); + int error, idx; - fp = (gfs_file_t *)vp->v_data; - fp->gfs_index = -1; - fp->gfs_ino = ino; - } else { - /* - * No static entry found, and there is no lookup callback, so - * return ENOENT. - */ - ret = ENOENT; + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; + if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || + (flags & FIGNORECASE)) + compare = strcasecmp; + else + compare = strcmp; + + gfs_dir_lock(dp); + + error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); + + if (vp && casecheck) { + gfs_dirent_t *ge; + int i; + + for (i = idx + 1; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcasecmp(ge->gfse_name, nm) == 0) { + *direntflags |= ED_CASE_CONFLICT; + goto out; + } + } + } + + if ((error || casecheck) && dp->gfsd_lookup) + error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, + &dynvp, cr, flags, direntflags, vp ? 
NULL : realpnp); + + if (vp && dynvp) { + /* static and dynamic entries are case-insensitive conflict */ + ASSERT(casecheck); + *direntflags |= ED_CASE_CONFLICT; + VN_RELE(dynvp); + } else if (vp == NULL) { + vp = dynvp; + } else if (error == ENOENT) { + error = 0; + } else if (error) { + VN_RELE(vp); + vp = NULL; } out: gfs_dir_unlock(dp); - if (ret == 0) - *vpp = vp; - else - *vpp = NULL; - - return (ret); + *vpp = vp; + return (error); } /* @@ -731,13 +921,15 @@ out: * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * - * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data) + * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and - * d_ino. + * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags + * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS + * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left @@ -745,12 +937,13 @@ out: * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data + * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, - u_long **cookies, void *data) + u_long **cookies, void *data, cred_t *cr, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; @@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, offset_t off, next; gfs_dir_t *dp = dvp->v_data; - ino = dp->gfsd_file.gfs_ino; - - if (dp->gfsd_file.gfs_parent == NULL) - pino = ino; /* root of filesystem */ - else - pino = ((gfs_file_t *) - (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; + error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino); + if (error) + return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino)) != 0) + pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, @@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, ncookies, - cookies)) != 0) + off, ino, dp->gfsd_static[off].gfse_name, 0, + ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { @@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, - data)) != 0 || eof) + data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; @@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, } /* + * gfs_vop_lookup: VOP_LOOKUP() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_lookup() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); +} + +/* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. 
Given a GFS directory, calls @@ -827,6 +1031,7 @@ gfs_vop_readdir(ap) { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; + cred_t *cr = ap->a_cred; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; @@ -842,7 +1047,8 @@ gfs_vop_readdir(ap) *ap->a_ncookies = ncookies; } - error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); + error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, + cr, 0); if (error == 0) { /* Subtract unused cookies */ @@ -882,6 +1088,9 @@ gfs_vop_inactive(ap) if (data != NULL) kmem_free(data, fp->gfs_size); + + VI_LOCK(vp); vp->v_data = NULL; + VI_UNLOCK(vp); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c new file mode 100644 index 000000000000..00a10aae8ec9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/vnode.h> + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 420f802f360d..7ca528033c4f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DVA-based Adjustable Replacement Cache * @@ -47,13 +45,13 @@ * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the + * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * @@ -75,7 +73,7 @@ * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface + * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the @@ -109,6 +107,14 @@ * * Note that the majority of the performance stats are manipulated * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists */ #include <sys/spa.h> @@ -117,6 +123,7 @@ #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> +#include <sys/vdev.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; @@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan; static int arc_dead; /* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* * These tunables are for performance analysis. 
*/ -u_long zfs_arc_max; -u_long zfs_arc_min; -TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); -TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_mdcomp_disable = 0; + +TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); +TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); +TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* - * Note that buffers can be on one of 5 states: + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies @@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
*/ typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; -/* The 5 states: */ +/* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; @@ -222,6 +259,24 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; } arc_stats_t; static arc_stats_t arc_stats = { @@ -252,7 +307,25 @@ static arc_stats_t arc_stats = { { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -299,6 +372,7 @@ static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN, + &arc_meta_used, 0, "ARC metadata used"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN, + &arc_meta_limit, 0, "ARC metadata limit"); + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; - 
arc_byteswap_func_t *acb_byteswap; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; @@ -368,6 +450,9 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; @@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up @@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ +#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines @@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr 
on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + static uint64_t -buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) { uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; @@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_hdr_t)); refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + rw_destroy(&buf->b_lock); } /* @@ -639,7 +854,7 @@ retry: hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; 
i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); } +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + static void -arc_cksum_compute(arc_buf_t *buf) +arc_cksum_compute(arc_buf_t *buf, boolean_t force) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); @@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_thaw(arc_buf_t *buf) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); @@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf) ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); } static void @@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); mutex_enter(&ab->b_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); + list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); mutex_exit(&ab->b_state->arcs_mtx); - /* remove the prefetch flag is we get a reference */ + /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; } @@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); + list_insert_head(&state->arcs_list[ab->b_type], ab); ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + 
uint64_t *size = &old_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); + list_remove(&old_state->arcs_list[ab->b_type], ab); /* * If prefetching out of the ghost cache, @@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list, ab); + list_insert_head(&new_state->arcs_list[ab->b_type], ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); + atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(&new_state->arcs_mtx); @@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { + if (new_state == arc_anon) { buf_hash_remove(ab); } @@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); } arc_buf_t * @@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa; hdr->b_state = arc_anon; hdr->b_arc_access = 0; - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = 
NULL; @@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock; /* - * Check to see if this buffer is currently being evicted via - * arc_do_user_evicts(). + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. */ - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - mutex_exit(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_READER); + if (buf->b_data == NULL) { + rw_exit(&buf->b_lock); return; } + hdr = buf->b_hdr; + ASSERT(hdr != NULL); hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - mutex_enter(hash_lock); - if (buf->b_data == NULL) { - /* - * This buffer is evicted. - */ - mutex_exit(hash_lock); - return; - } + rw_exit(&buf->b_lock); - ASSERT(buf->b_hdr == hdr); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); arc_access(hdr, hash_lock); @@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { @@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + atomic_add_64(&arc_size, -size); } - atomic_add_64(&arc_size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); @@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
+ */ + mutex_enter(&l2arc_buflist_mtx); + if (hdr->b_l2hdr != NULL) { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, + hdr); + } + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); @@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_destroy(&hdr->b_freeze_lock); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf) * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; @@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; @@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; + if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + missed += 1; + break; + } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && - ab->b_size == bytes) { + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } @@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); } else { + rw_exit(&buf->b_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } - ASSERT(ab->b_datacnt == 0); - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (ab->b_datacnt == 0) { + 
arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) @@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + return (stolen); } @@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, int64_t bytes) +arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes) ASSERT(GHOST_STATE(state)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; hash_lock = HDR_LOCK(ab); if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. 
+ */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; @@ -1256,6 +1630,12 @@ top: } mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); @@ -1271,38 +1651,58 @@ arc_adjust(void) { int64_t top_sz, mru_over, arc_over, todelete; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, + ARC_BUFC_METADATA); top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); - arc_evict_ghost(arc_mru_ghost, todelete); + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); } } if ((arc_over = arc_size - arc_c) > 0) { int64_t tbl_over; - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); - (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; + } + + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_METADATA); } - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); - arc_evict_ghost(arc_mfu_ghost, todelete); + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } } @@ -1314,7 +1714,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; + rw_enter(&buf->b_lock, RW_WRITER); buf->b_hdr = NULL; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -1329,24 +1731,40 @@ arc_do_user_evicts(void) } /* - * Flush all *evictable* data from the cache. + * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. 
referenced) data. */ void -arc_flush(void) +arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } - arc_evict_ghost(arc_mru_ghost, -1); - arc_evict_ghost(arc_mfu_ghost, -1); + arc_evict_ghost(arc_mru_ghost, spa, -1); + arc_evict_ghost(arc_mfu_ghost, spa, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); - ASSERT(arc_eviction_list == NULL); + ASSERT(spa || arc_eviction_list == NULL); } int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ @@ -1380,7 +1798,7 @@ arc_shrink(void) arc_adjust(); } -static int zfs_needfree = 0; +static int needfree = 0; static int arc_reclaim_needed(void) @@ -1391,13 +1809,28 @@ arc_reclaim_needed(void) #ifdef _KERNEL - if (zfs_needfree) + if (needfree) return (1); #if 0 /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the + * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. @@ -1405,23 +1838,6 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -1431,7 +1847,7 @@ arc_reclaim_needed(void) * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is + * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < @@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif #ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. @@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif /* - * An agressive reclamation will shrink the cache size as well as + * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) @@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused) /* reset the growth delay for every reclaim */ growtime = LBOLT + (arc_grow_retry * hz); - ASSERT(growtime > 0); - if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* - * If zfs_needfree is TRUE our vm_lowmem hook + * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. */ @@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused) last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) { + arc_warm = B_TRUE; + + } else if (arc_no_grow && LBOLT >= growtime) { arc_no_grow = FALSE; } - if (zfs_needfree || + if (needfree || (2 * arc_c < arc_size + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) arc_adjust(); @@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused) arc_do_user_evicts(); if (arc_reclaim_needed()) { - zfs_needfree = 0; + needfree = 0; #ifdef _KERNEL - wakeup(&zfs_needfree); + wakeup(&needfree); #endif } @@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; + if (state == arc_l2c_only) + return; + ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state) * prior to insert. */ static int -arc_evict_needed() +arc_evict_needed(arc_buf_contents_t type) { + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#if 0 +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif +#endif + if (arc_reclaim_needed()) return (1); @@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf) * We have not yet reached cache maximum size, * just allocate a new buffer. 
*/ - if (!arc_evict_needed()) { + if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); goto out; } @@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -1728,7 +2170,7 @@ out: atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous @@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); @@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = LBOLT; @@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. 
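+ * Bring it back into the primary cache, moving it directly to the + * MFU state.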
+ */ + + buf->b_arc_access = LBOLT; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); } else { ASSERT(!"invalid arc state"); } @@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) VERIFY(arc_buf_remove_ref(buf, arg) == 1); } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { @@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio) &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~ARC_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags &= ~ARC_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) - callback_list->acb_byteswap(buf->b_data, hdr->b_size); + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? + byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + func(buf->b_data, hdr->b_size); + } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); /* create copies of the data buffer for the callers */ abuf = buf; @@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_refcnt); - /* convert checksum errors into IO errors */ - if (zio->io_error == ECKSUM) - zio->io_error = EIO; } /* @@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. + * + * Normal callers should use arc_read and pass the arc buffer and offset + * for the bp. But if you know you don't need locking, you can use + * arc_read_nolock.
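+ * + * For example (a sketch; pbuf is the referenced parent buffer that + * contains bp, while abuf and aflags are illustrative locals): + * + * uint32_t aflags = ARC_WAIT; + * arc_buf_t *abuf = NULL; + * (void) arc_read(pio, spa, bp, pbuf, arc_getbuf_func, &abuf, + * ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);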
*/ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + + ASSERT3P(hdr, ==, pbuf->b_hdr); + rw_exit(&pbuf->b_lock); + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; - zio_t *rzio; + zio_t *rzio; top: hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); @@ -2053,10 +2525,9 @@ top: KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, flags); + spa, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2093,6 +2564,8 @@ top: } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), @@ -2104,6 +2577,8 @@ top: } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; + vdev_t *vd = NULL; + daddr_t addr; if (hdr == NULL) { /* this block is not in the cache */ @@ -2130,6 +2605,8 @@ top: private); hdr->b_flags |= ARC_PREFETCH; } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { @@ -2144,7 +2621,9 @@ top: hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -2160,7 +2639,6 @@ top: acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; @@ -2176,6 +2654,18 @@ top: if (GHOST_STATE(hdr->b_state)) arc_access(hdr, hash_lock); + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); @@ -2186,8 +2676,65 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); + if (vd != NULL) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. 
+ */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } + rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, flags, zb); + arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); return (0); - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - - if (buf->b_data == NULL) { + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ /* - * We are on the eviction list. + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. */ - mutex_exit(hash_lock); - mutex_enter(&arc_eviction_mtx); - if (buf->b_hdr == NULL) { - /* - * We are already in arc_do_user_evicts(). - */ - mutex_exit(&arc_eviction_mtx); - return (0); - } else { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * Process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&arc_eviction_mtx); - VERIFY(copy.b_efunc(&copy) == 0); - return (1); - } + buf->b_efunc = NULL; + rw_exit(&buf->b_lock); + VERIFY(copy.b_efunc(&copy) == 0); + return (1); } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); @@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf) arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags = ARC_IN_HASH_TABLE; + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); + rw_exit(&buf->b_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; @@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf) * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make - * make a new hdr for the buffer. + * a new hdr for the buffer.
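+ * + * A typical use is rewriting a cached block in place (a sketch): + * + * arc_release(buf, tag); ... buf is now anonymous ... + * ... modify buf->b_data ... + * zio = arc_write(pio, spa, wp, l2arc, txg, bp, buf, ready, + * done, private, priority, zio_flags, zb);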
*/ void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); + rw_exit(&buf->b_lock); return; } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + /* * Do we have more than one buf? */ - if (hdr->b_buf != buf || buf->b_next != NULL) { + if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_datacnt > 1); + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this buf and attach it to * a new anonymous buf. @@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; - mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; + rw_exit(&buf->b_lock); atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; } else { + rw_exit(&buf->b_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; @@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag) } buf->b_efunc = NULL; buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } } int arc_released(arc_buf_t *buf) { - return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + int released; + + rw_enter(&buf->b_lock, RW_READER); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + rw_exit(&buf->b_lock); + return (released); } int arc_has_callback(arc_buf_t *buf) { - return (buf->b_efunc != NULL); + int callback; + + rw_enter(&buf->b_lock, RW_READER); + callback = (buf->b_efunc != NULL); + rw_exit(&buf->b_lock); + return (callback); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - return 
(refcount_count(&buf->b_hdr->b_refcnt)); + int referenced; + + rw_enter(&buf->b_lock, RW_READER); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + rw_exit(&buf->b_lock); + return (referenced); } #endif @@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio) hdr->b_acb = NULL; - /* this buffer is on no lists and is not in the hash table */ - ASSERT3P(hdr->b_state, ==, arc_anon); - hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = zio->io_bp->blk_birth; hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; @@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ + ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); ASSERT3U(zio->io_bp_orig.blk_birth, ==, @@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio) ASSERT3P(exists, ==, NULL); } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - arc_access(hdr, hash_lock); + /* if it's not anon, we are doing a scrub */ + if (hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else if (callback->awcb_done == NULL) { int destroy_hdr; @@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio) } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } + hdr->b_flags &= ~ARC_STORED; if (callback->awcb_done) { ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio) kmem_free(callback, sizeof (arc_write_callback_t)); } +static void +write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) +{ + boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); + + /* Determine checksum setting */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && + !zio_checksum_table[wp->wp_oschecksum].ci_zbt) + zp->zp_checksum = wp->wp_oschecksum; + else + zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, + wp->wp_oschecksum); + } + + /* Determine compression setting */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + zp->zp_compress = zio_compress_select(wp->wp_dncompress, + wp->wp_oscompress); + } + + zp->zp_type = wp->wp_type; + zp->zp_level = wp->wp_level; + zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); +} + zio_t * -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, +arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; - zio_t *zio; + zio_t *zio; + zio_prop_t zp; - /* this is a private buffer - no locking required */ - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(ready != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == 0); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, - priority, flags, zb); + + write_policy(spa, wp, &zp); + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * nonzero, it should match what we have in the cache. */ ASSERT(bp->blk_cksum.zc_word[0] == 0 || - ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + bp->blk_cksum.zc_word[0] == ab->b_cksum0 || + bp->blk_fill == BLK_FILL_ALREADY_FREED); + if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_private = NULL; mutex_exit(hash_lock); } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); @@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } } - zio = zio_free(pio, spa, txg, bp, done, private); + zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); if (arc_flags & ARC_WAIT) return (zio_wait(zio)); @@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, return (0); } +static int +arc_memory_throttle(uint64_t reserve, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t inflight_data = arc_anon->arcs_size; + uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if 0 +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif +#endif + if (available_memory >= zfs_write_limit_max) + return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. 
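+ * For example, with 1 Gbyte of free memory, pageout may accumulate up + * to 256 Mbytes (available_memory / 4) of deflated reserves within a + * txg before we return ERESTART to back the writer off.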
+ */ + if (curproc == pageproc) { + if (page_load > available_memory / 4) + return (ERESTART); + /* Note: reserve is inflated, so we deflate */ + page_load += reserve / 8; + return (0); + } else if (page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (EAGAIN); + } + page_load = 0; + + if (arc_size > arc_c_min) { + uint64_t evictable_memory = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + available_memory += MIN(evictable_memory, arc_size - arc_c_min); + } + + if (inflight_data > available_memory / 4) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (ERESTART); + } +#endif + return (0); +} + void -arc_tempreserve_clear(uint64_t tempreserve) +arc_tempreserve_clear(uint64_t reserve) { - atomic_add_64(&arc_tempreserve, -tempreserve); + atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int -arc_tempreserve_space(uint64_t tempreserve) +arc_tempreserve_space(uint64_t reserve, uint64_t txg) { + int error; + #ifdef ZFS_DEBUG /* * Once in a while, fail for no reason. Everything should cope. @@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve) return (ERESTART); } #endif - if (tempreserve > arc_c/4 && !arc_no_grow) - arc_c = MIN(arc_c_max, tempreserve * 4); - if (tempreserve > arc_c) + if (reserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, reserve * 4); + if (reserve > arc_c) return (ENOMEM); /* + * Writes will, almost always, require additional memory allocations + * in order to compress/encrypt/etc the data. We therefore need to + * make sure that there is sufficient available memory for this. + */ + if (error = arc_memory_throttle(reserve, txg)) + return (error); + + /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. - * - * XXX The limit should be adjusted dynamically to keep the time - * to sync a dataset fixed (around 1-5 seconds?). */ - - if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { - dprintf("failing, arc_tempreserve=%lluK anon=%lluK " - "tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, arc_anon->arcs_lsize>>10, - tempreserve>>10, arc_c>>10); + if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && + arc_anon->arcs_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, + reserve>>10, arc_c>>10); return (ERESTART); } - atomic_add_64(&arc_tempreserve, tempreserve); + atomic_add_64(&arc_tempreserve, reserve); return (0); } @@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused) /* Serialize access via arc_lowmem_lock.
*/ mutex_enter(&arc_lowmem_lock); - zfs_needfree = 1; + needfree = 1; cv_signal(&arc_reclaim_thr_cv); - while (zfs_needfree) - tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); + while (needfree) + tsleep(&needfree, 0, "zfs:lowmem", hz / 5); mutex_exit(&arc_lowmem_lock); } #endif @@ -2743,6 +3442,16 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -2757,6 +3466,7 @@ arc_init(void) arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -2764,15 +3474,28 @@ arc_init(void) mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2798,6 +3521,13 @@ arc_init(void) #endif arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef _KERNEL /* Warn about ZFS memory and address space requirements. 
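 * As the warnings below suggest, tuning usually means raising * vm.kmem_size and vm.kmem_size_max in /boot/loader.conf; the printed * recommendation is a kmem_size of at least 512 Mbytes.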
*/ @@ -2808,9 +3538,9 @@ arc_init(void) if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " + printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); + printf(" in /boot/loader.conf.\n"); } #endif } @@ -2818,6 +3548,7 @@ arc_init(void) void arc_fini(void) { + mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); @@ -2825,7 +3556,7 @@ arc_fini(void) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); - arc_flush(); + arc_flush(NULL); arc_dead = TRUE; @@ -2838,10 +3569,14 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx); @@ -2849,6 +3584,8 @@ arc_fini(void) mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&zfs_write_limit_lock); + buf_fini(); mutex_destroy(&arc_lowmem_lock); @@ -2857,3 +3594,985 @@ arc_fini(void) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
+ * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
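+ * + * For example, taking the 8 Mbyte write max illustrated above and (for + * illustration) an equal boost, each feed interval writes at most + * 8 Mbytes to a device, or 16 Mbytes while arc_warm is still false: + * + * size = dev->l2ad_write; + * if (arc_warm == B_FALSE) + * size += dev->l2ad_boost;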
+ */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. 
+ */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) + zio_nowait(zio_read(zio->io_parent, + cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. 
+ * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. + */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. 
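+ * The read path honours the flag in arc_read_nolock(), which falls + * back to the primary pool unless: + * + * hdr->b_l2hdr != NULL && + * !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)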
+ */ +static void +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + int try; + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (ab->b_spa != spa) { + mutex_exit(hash_lock); + continue; + } + + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + continue; + } + + if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (ab->b_buf == NULL) { + DTRACE_PROBE1(l2arc__buf__null, void *, ab); + mutex_exit(hash_lock); + continue; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. + */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. 
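+ * (vdev_psize_to_asize() rounds the buffer's physical size up to the + * device's allocation unit, so e.g. a 512 byte buffer still advances + * the hand by one full unit.)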
+ */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return; + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + spa_l2cache_space_update(dev->l2ad_vdev, 0, + dev->l2ad_end - dev->l2ad_hand); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + (void) zio_wait(pio); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + /* + * Pause for l2arc_feed_secs seconds between writes. + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + hz * l2arc_feed_secs); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = dev->l2ad_write; + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + l2arc_write_buffers(spa, dev, size); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. 
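+	 *
+	 * [Editor's note: the fields initialized below define the device's
+	 * write ring: l2ad_start/l2ad_end bound the usable region, the
+	 * write hand and evict hand both begin at l2ad_start, and
+	 * l2ad_first stays B_TRUE until l2arc_write_buffers() wraps the
+	 * hand, which is what lets l2arc_evict() skip its first sweep.]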
+ */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = start; + adddev->l2ad_end = end; + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. 
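+	 *
+	 * [Editor's note: if that ordering were ever violated, the
+	 * list_destroy() calls below would fire on non-empty lists; a
+	 * defensive variant, not part of this change, could assert
+	 * l2arc_ndev == 0 before tearing the locks down.]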
+ */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c index 4442b1f28ac8..93b7741d77be 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/bplist.h> #include <sys/zfs_context.h> @@ -47,7 +45,7 @@ bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) { int size; - size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ? + size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? BPLIST_SIZE_V0 : sizeof (bplist_phys_t); return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, @@ -181,7 +179,7 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) } int -bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) +bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) { uint64_t blk, off; blkptr_t *bparray; @@ -229,7 +227,7 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) * Deferred entry; will be written later by bplist_sync(). */ void -bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) +bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) { bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); @@ -278,9 +276,7 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - uint64_t itor = 0, comp = 0, uncomp = 0; int err; - blkptr_t bp; mutex_enter(&bpl->bpl_lock); @@ -298,6 +294,9 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) mutex_exit(&bpl->bpl_lock); if (!bpl->bpl_havecomp) { + uint64_t itor = 0, comp = 0, uncomp = 0; + blkptr_t bp; + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { comp += BP_GET_PSIZE(&bp); uncomp += BP_GET_UCSIZE(&bp); @@ -310,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) return (err); } + +/* + * Return (in *dasizep) the amount of space on the deadlist which is: + * mintxg < blk_birth <= maxtxg + */ +int +bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *dasizep) +{ + uint64_t size = 0; + uint64_t itor = 0; + blkptr_t bp; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpl_bytes rather than iterating over the bps. 
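+	 *
+	 * [Editor's example: a caller asking for everything passes
+	 * mintxg = 0 (below TXG_INITIAL) and maxtxg = UINT64_MAX, taking
+	 * the constant-time branch below; a delta query such as
+	 * mintxg = 100, maxtxg = 200 falls through to the
+	 * bplist_iterate() loop and sums bp_get_dasize() over blocks
+	 * with 100 < blk_birth <= 200.]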
+ */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err == 0) + *dasizep = bpl->bpl_phys->bpl_bytes; + mutex_exit(&bpl->bpl_lock); + return (err); + } + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { + size += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + } + } + if (err == ENOENT) + err = 0; + *dasizep = size; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 94c63081478a..2494c1e7f9d1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> @@ -39,17 +37,10 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - /* * Global data structures and functions for the dbuf cache. */ @@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db) } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - blkptr_t *bp; + dnode_t *dn = db->db_dnode; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
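 	 * [Editor's note on the contract: callers enter with the dnode's
 	 * dn_struct_rwlock held, as reader or writer, and with db_mtx
 	 * held, exactly what the assertions below verify; db_blkptr
 	 * points into the parent's buffer and could otherwise move under
 	 * concurrent structural changes to the dnode.]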
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; @@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db_state = DB_READ; mutex_exit(&db->db_mtx); + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + zb.zb_objset = db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; @@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
@@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

 	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+	    DBUF_IS_CACHEABLE(db);

 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
@@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		if (db->db_blkid == DB_BONUS_BLKID) {
 			/* Note that the data bufs here are zio_bufs */
 			dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+			arc_space_consume(DN_MAX_BONUSLEN);
 			bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 		} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 			int size = db->db.db_size;
@@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 	/* free this block */
 	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
 		/* XXX can get silent EIO here */
-		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+		(void) dsl_free(NULL,
+		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
 		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 		arc_release(dr->dt.dl.dr_data, db);
 }

+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.  Also, if we happen across any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
 void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t first_l1 = start >> epbs;
+	uint64_t last_l1 = end >> epbs;

-	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+	if (end > dn->dn_maxblkid) {
+		end = dn->dn_maxblkid;
+		last_l1 = end >> epbs;
+	}
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		if (db->db_level == 1 &&
+		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+			mutex_enter(&db->db_mtx);
+			if (db->db_last_dirty &&
+			    db->db_last_dirty->dr_txg < txg) {
+				dbuf_add_ref(db, FTAG);
+				mutex_exit(&db->db_mtx);
+				dbuf_will_dirty(db, tx);
+				dbuf_rele(db, FTAG);
+			} else {
+				mutex_exit(&db->db_mtx);
+			}
+		}
+
 		if (db->db_level != 0)
 			continue;
 		dprintf_dbuf(db, "found buf %s\n", "");
-		if (db->db_blkid < blkid ||
-		    db->db_blkid >= blkid+nblks)
+		if (db->db_blkid < start || db->db_blkid > end)
 			continue;

 		/* found a level 0 buffer in the range */
@@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 }

 static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;

-	/* Don't count meta-objects */
-	if (ds == NULL)
-		return (FALSE);
-
 	/*
 	 * We don't need any locking to protect db_blkptr:
 	 * If it's syncing, then db_last_dirty will be set
 	 * so we'll ignore db_blkptr.
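 	 * [Editor's summary of the decision below: a live dirty record
 	 * supplies dr_txg as the birth txg; failing that, an existing bp
 	 * supplies blk_birth; if neither is set the block was never
 	 * written, so there is nothing to free and FALSE is returned.]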
*/ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ - dbuf_unoverride(*drp); + dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); - return (*drp); + return (dr); } /* @@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
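+		 *
+		 * [Editor's example: overwriting a 128KB dbuf whose old bp
+		 * occupies 128KB on disk gives willfree = bp_get_dasize()
+		 * = 128KB, credited back through the negative argument
+		 * below; when the old bp is absent or a hole, db.db_size
+		 * serves as the estimate instead.]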
+ */ + dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; @@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { @@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* XXX would be nice to fix up dn_towrite_space[] */ - db->db_last_dirty = dr->dr_next; + *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - int rf = DB_RF_MUST_SUCCEED; + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1469,31 +1508,33 @@ 
dbuf_destroy(dmu_buf_impl_t *db) ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (list_link_active(&db->db_link)) { + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; + ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; zb.zb_objset = dn->dn_objset->os_dsl_dataset ? @@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) zb.zb_level = 0; zb.zb_blkid = blkid; - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) return (err ? NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); @@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; - arc_release(db->db_buf, db); mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); @@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int checksum, compress; int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
-	 */
+	ASSERT(*datap != NULL);
 	ASSERT3U(db->db_level, ==, 0);
 	ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 	bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
-	if (*datap != db->db.db_data)
+	if (*datap != db->db.db_data) {
 		zio_buf_free(*datap, DN_MAX_BONUSLEN);
+		arc_space_return(DN_MAX_BONUSLEN);
+	}
 	db->db_data_pending = NULL;
 	drp = &db->db_last_dirty;
 	while (*drp != dr)
 		drp = &(*drp)->dr_next;
-	ASSERT((*drp)->dr_next == NULL);
-	*drp = NULL;
+	ASSERT(dr->dr_next == NULL);
+	*drp = dr->dr_next;
 	if (dr->dr_dbuf->db_level != 0) {
 		list_destroy(&dr->dt.di.dr_children);
 		mutex_destroy(&dr->dt.di.dr_mtx);
@@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	}

 	/*
+	 * This function may have dropped the db_mtx lock allowing a dmu_sync
+	 * operation to sneak in. As a result, we need to ensure that we
+	 * don't check the dr_override_state until we have returned from
+	 * dbuf_check_blkptr.
+	 */
+	dbuf_check_blkptr(dn, db);
+
+	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
-	dbuf_check_blkptr(dn, db);
-
 	/*
 	 * If this dbuf has already been written out via an immediate write,
 	 * just complete the write by copying over the new block pointer and
@@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		zio_fake.io_bp = db->db_blkptr;
 		zio_fake.io_bp_orig = *db->db_blkptr;
 		zio_fake.io_txg = txg;
+		zio_fake.io_flags = 0;

 		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		dr->dr_zio = &zio_fake;

 		mutex_exit(&db->db_mtx);

+		ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
+		    BP_IDENTITY(&zio_fake.io_bp_orig)) ||
+		    BP_IS_HOLE(zio_fake.io_bp));
+
 		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
-			dsl_dataset_block_kill(os->os_dsl_dataset,
+			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
 			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

 		dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
 			bcopy(db->db.db_data, (*datap)->b_data, blksz);
 		}
-	} else {
-		/*
-		 * Private object buffers are released here rather
-		 * than in dbuf_dirty() since they are only modified
-		 * in the syncing context and we don't want the
-		 * overhead of making multiple copies of the data.
-		 */
-		arc_release(db->db_buf, db);
 	}

 	ASSERT(*datap != NULL);
@@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)

 	mutex_exit(&db->db_mtx);

-	/*
-	 * Allow dnode settings to override objset settings,
-	 * except for metadata checksums.
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); + dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) @@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; @@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? */ + arc_buf_thaw(data); + } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); @@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ @@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; int old_size, new_size, i; + ASSERT(db->db_blkptr == bp); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); + new_size = bp_get_dasize(os->os_spa, bp); - dnode_diduse_space(dn, new_size-old_size); + dnode_diduse_space(dn, new_size - old_size); - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(bp)) { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + 
ASSERT3U(bp->blk_fill, ==, 0); return; } + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + mutex_enter(&db->db_mtx); if (db->db_level == 0) { @@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill = 1; } } else { - blkptr_t *bp = db->db.db_data; + blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, + ASSERT3U(BP_GET_LSIZE(ibp), ==, db->db_level == 1 ? dn->dn_datablksz : (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; + fill += ibp->blk_fill; } } - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + bp->blk_fill = fill; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); } } @@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_enter(&db->db_mtx); drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index d3be6b4ff22e..377efb9d105e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> @@ -42,6 +40,7 @@ #include <sys/zfs_ioctl.h> #include <sys/zap.h> #include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "DSL props" }, { byteswap_uint64_array, TRUE, "DSL dataset" }, { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, { byteswap_uint8_array, FALSE, "ZFS plain file" }, { zap_byteswap, TRUE, "ZFS directory" }, { zap_byteswap, TRUE, "ZFS master node" }, @@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "persistent error log" }, { byteswap_uint8_array, TRUE, "SPA history" }, { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scrub work queue" }, }; int @@ -115,6 +121,19 @@ dmu_bonus_max(void) return (DN_MAX_BONUSLEN); } +int +dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + if (newsize < 0 || newsize > db->db_size) + return (EINVAL); + dnode_setbonuslen(dn, newsize, tx); + return (0); +} + /* * returns ENOENT, EIO, or 0. 
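 * [Editor's usage sketch for the bonus-buffer interfaces here; `os'
 * and `object' are assumed to name an open objset and an allocated
 * object:
 *
 *	dmu_buf_t *db;
 *	VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
 *	(inspect up to db->db_size bytes at db->db_data)
 *	dmu_buf_rele(db, FTAG);
 *
 * dmu_set_bonus() may then adjust the advertised length, up to
 * db->db_size, inside an assigned transaction.]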
*/ @@ -122,27 +141,27 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; - int err, count; dmu_buf_impl_t *db; + int error; - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); + error = dnode_hold(os->os, object, FTAG, &dn); + if (error) + return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); + dbuf_create_bonus(dn); } db = dn->dn_bonus; rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - count = refcount_add(&db->db_holds, tag); - mutex_exit(&db->db_mtx); - if (count == 1) - dnode_add_ref(dn, db); + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) + VERIFY(dnode_add_ref(dn, db)); + dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); @@ -161,11 +180,13 @@ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { + dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t flags; int err; zio_t *zio; + hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); @@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); @@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, /* wait for async i/o */ err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); @@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +static int +get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +{ + uint64_t len = *offset - limit; + uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; + uint64_t subchunk = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT(limit <= *offset); + + if (len <= chunk_len) { + *offset = limit; + return (0); + } + + ASSERT(ISP2(subchunk)); + + while (*offset > limit) { + uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); + uint64_t delta; + int err; + + /* skip over allocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + ASSERT3U(*offset, <=, initial_offset); + *offset = P2ALIGN(*offset, subchunk); + delta = initial_offset - *offset; + if (delta >= chunk_len) { + *offset += delta - chunk_len; + return (0); + } + chunk_len -= delta; + + /* skip over unallocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + if (*offset < limit) + *offset = limit; + ASSERT3U(*offset, <, initial_offset); + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t 
*dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + if (trunc || (end = offset + length) > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? -1 : len, tx); + + if (start == 0 && free_dnode) { + ASSERT(trunc); + dnode_free(dn, tx); + } + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) @@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int err; /* * NB: we could do this block-at-a-time, but it's nice @@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp); if (err) - return (err); + break; for (i = 0; i < numbufs; i++) { int tocpy; @@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); - return (0); + return (err); } void @@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); - va = ppmapin(pp, PROT_READ, (caddr_t)-1); + va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); - ppmapout(va); + zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } @@ -620,6 +796,22 @@ typedef struct { /* ARGSUSED */ static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + blkptr_t *bp = zio->io_bp; + + if (!BP_IS_HOLE(bp)) { + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + ASSERT(BP_GET_TYPE(bp) == 
db->db_dnode->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } +} + +/* ARGSUSED */ +static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *in = varg; @@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) dmu_buf_impl_t *db = dr->dr_dbuf; dmu_sync_cb_t *done = in->done; - if (!BP_IS_HOLE(zio->io_bp)) { - zio->io_bp->blk_fill = 1; - BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); - BP_SET_LEVEL(zio->io_bp, 0); - } - mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ @@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, dbuf_dirty_record_t *dr; dmu_sync_arg_t *in; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; int err; ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); @@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - zio = arc_write(pio, os->os_spa, - zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), - zio_compress_select(db->db_dnode->dn_compress, os->os_compress), - dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), - txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); + + wp.wp_type = db->db_dnode->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dnchecksum = db->db_dnode->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + wp.wp_dncompress = db->db_dnode->dn_compress; + wp.wp_oscompress = os->os_compress; + + ASSERT(BP_IS_HOLE(bp)); + + zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), + txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); if (pio) { zio_nowait(zio); @@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, } int -dmu_get_replication_level(objset_impl_t *os, - zbookmark_t *zb, dmu_object_type_t ot) -{ - int ncopies = os->os_copies; - - /* If it's the mos, it should have max copies set. */ - ASSERT(zb->zb_objset != 0 || - ncopies == spa_max_replication(os->os_spa)); - - if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) - ncopies++; - return (MIN(ncopies, spa_max_replication(os->os_spa))); -} - -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; @@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) return (err); } - err = dnode_next_offset(dn, hole, off, 1, 1, 0); + err = dnode_next_offset(dn, (hole ? 
DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); @@ -1018,6 +1193,7 @@ dmu_init(void) dbuf_init(); dnode_init(); arc_init(); + l2arc_init(); } void @@ -1026,4 +1202,5 @@ dmu_fini(void) arc_fini(); dnode_fini(); dbuf_fini(); + l2arc_fini(); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 93168cc8901f..1b9247d66e65 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,7 +54,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; int error = dnode_next_offset(osi->os_meta_dnode, - B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; @@ -139,6 +140,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -152,7 +154,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) int error; error = dnode_next_offset(os->os->os_meta_dnode, - hole, &offset, 0, DNODES_PER_BLOCK, txg); + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 378fe8c15bc0..7981e06825c4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include <sys/cred.h> #include <sys/zfs_context.h> #include <sys/dmu_objset.h> #include <sys/dsl_dir.h> @@ -32,6 +31,7 @@ #include <sys/dsl_prop.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/dnode.h> #include <sys/dbuf.h> #include <sys/zvol.h> @@ -40,7 +40,7 @@ #include <sys/zap.h> #include <sys/zil.h> #include <sys/dmu_impl.h> - +#include <sys/zfs_ioctl.h> spa_t * dmu_objset_spa(objset_t *os) @@ -131,6 +131,34 @@ copies_changed_cb(void *arg, uint64_t newval) osi->os_copies = newval; } +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. 
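+	 *
+	 * [Editor's note: newval is one of the three settings of the
+	 * secondarycache property: ZFS_CACHE_ALL admits data and
+	 * metadata to the L2ARC, ZFS_CACHE_METADATA admits metadata
+	 * only, and ZFS_CACHE_NONE keeps this objset out of the L2ARC;
+	 * the value is consumed by DMU_OS_IS_L2CACHEABLE() when the
+	 * objset's root block is read.]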
+ */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_secondary_cache = newval; +} + void dmu_objset_byteswap(void *buf, size_t size) { @@ -146,8 +174,10 @@ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_impl_t **osip) { - objset_impl_t *winner, *osi; - int i, err, checksum; + objset_impl_t *osi; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); osi->os.os = osi; @@ -161,18 +191,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zb.zb_object = 0; zb.zb_level = -1; zb.zb_blkid = 0; + if (DMU_OS_IS_L2CACHEABLE(osi)) + aflags |= ARC_L2CACHE; dprintf_bp(osi->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, osi->os_rootbp, - dmu_ot[DMU_OT_OBJSET].ot_byteswap, + /* + * NB: when bprewrite scrub can change the bp, + * and this is called from dmu_objset_open_ds_os, the bp + * could change, and we'll need a lock. + */ + err = arc_read_nolock(NULL, spa, osi->os_rootbp, arc_getbuf_func, &osi->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { kmem_free(osi, sizeof (objset_impl_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; return (err); } osi->os_phys = osi->os_phys_buf->b_data; - arc_release(osi->os_phys_buf, &osi->os_phys_buf); } else { osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &osi->os_phys_buf, ARC_BUFC_METADATA); @@ -183,18 +221,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know, and - * registering would complicate clone promotion. + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. */ - if (ds && ds->ds_phys->ds_num_children == 0) { - err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); - if (err == 0) - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + if (ds) { + err = dsl_prop_register(ds, "primarycache", + primary_cache_changed_cb, osi); if (err == 0) - err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + err = dsl_prop_register(ds, "secondarycache", + secondary_cache_changed_cb, osi); + if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0) + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, osi); + } if (err) { VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); @@ -206,24 +252,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; osi->os_compress = ZIO_COMPRESS_LZJB; osi->os_copies = spa_max_replication(spa); + osi->os_primary_cache = ZFS_CACHE_ALL; + osi->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); - - /* - * Metadata always gets compressed and checksummed. - * If the data checksum is multi-bit correctable, and it's not - * a ZBT-style checksum, then it's suitable for metadata as well. - * Otherwise, the metadata checksum defaults to fletcher4. 
- */ - checksum = osi->os_checksum; - - if (zio_checksum_table[checksum].ci_correctable && - !zio_checksum_table[checksum].ci_zbt) - osi->os_md_checksum = checksum; - else - osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_md_compress = ZIO_COMPRESS_LZJB; + osi->os_zil_header = osi->os_phys->os_zil_header; + osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), @@ -238,70 +272,118 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); - if (ds != NULL) { - winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); - if (winner) { - dmu_objset_evict(ds, osi); - osi = winner; - } + /* + * We should be the only thread trying to do this because we + * have ds_opening_lock + */ + if (ds) { + VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, + dmu_objset_evict)); } *osip = osi; return (0); } -/* called from zpl */ -int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +static int +dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) { - dsl_dataset_t *ds; - int err; - objset_t *os; objset_impl_t *osi; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dsl_dataset_open(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); - return (err); - } - + mutex_enter(&ds->ds_opening_lock); osi = dsl_dataset_get_user_ptr(ds); if (osi == NULL) { + int err; + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &ds->ds_phys->ds_bp, &osi); if (err) { - dsl_dataset_close(ds, mode, os); - kmem_free(os, sizeof (objset_t)); + mutex_exit(&ds->ds_opening_lock); return (err); } } + mutex_exit(&ds->ds_opening_lock); os->os = osi; - os->os_mode = mode; + os->os_mode = DS_MODE_NOHOLD; - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { - dmu_objset_close(os); + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) return (EINVAL); - } - *osp = os; return (0); } +int +dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +{ + objset_t *os; + int err; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dmu_objset_open_ds_os(ds, os, type); + if (err) + kmem_free(os, sizeof (objset_t)); + else + *osp = os; + return (err); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + objset_t *os; + dsl_dataset_t *ds; + int err; + + ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || + DS_MODE_TYPE(mode) == DS_MODE_OWNER); + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + err = dsl_dataset_hold(name, os, &ds); + else + err = dsl_dataset_own(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + err = dmu_objset_open_ds_os(ds, os, type); + if (err) { + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + dsl_dataset_rele(ds, os); + else + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); + } else { + os->os_mode = mode; + *osp = os; + } + return (err); +} + void dmu_objset_close(objset_t *os) { - dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); + ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || + DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || + 
DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + + if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) + dsl_dataset_rele(os->os->os_dsl_dataset, os); + else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) + dsl_dataset_disown(os->os->os_dsl_dataset, os); kmem_free(os, sizeof (objset_t)); } int -dmu_objset_evict_dbufs(objset_t *os, int try) +dmu_objset_evict_dbufs(objset_t *os) { objset_impl_t *osi = os->os; dnode_t *dn; @@ -319,34 +401,25 @@ dmu_objset_evict_dbufs(objset_t *os, int try) * skip. */ for (dn = list_head(&osi->os_dnodes); - dn && refcount_is_zero(&dn->dn_holds); + dn && !dnode_add_ref(dn, FTAG); dn = list_next(&osi->os_dnodes, dn)) continue; - if (dn) - dnode_add_ref(dn, FTAG); while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&osi->os_dnodes, next_dn); - } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); - if (next_dn) - dnode_add_ref(next_dn, FTAG); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&osi->os_lock); - if (dnode_evict_dbufs(dn, try)) { - dnode_rele(dn, FTAG); - if (next_dn) - dnode_rele(next_dn, FTAG); - return (1); - } + dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&osi->os_lock); dn = next_dn; } mutex_exit(&osi->os_lock); - return (0); + return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); } void @@ -361,13 +434,19 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); } - if (ds && ds->ds_phys->ds_num_children == 0) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + if (ds) { + if (!dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, osi)); + } + VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + primary_cache_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + secondary_cache_changed_cb, osi)); } /* @@ -375,7 +454,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) * nothing can be added to the list at this point. 
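
The rewritten eviction loop above pins its cursor: dnode_add_ref() is taken on the next dnode before os_lock is dropped for the expensive dnode_evict_dbufs() call, so a concurrent removal cannot free the node the walk will resume from, and a failed add_ref doubles as "skip dying dnodes". A pthread sketch of the same pattern, assuming invented node_hold/node_rele helpers with the same fail-if-dead semantics:

#include <pthread.h>
#include <stdio.h>

struct node {
    struct node *next;
    int refs;           /* protected by list_lock; 0 means dying */
    int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of dnode_add_ref(): refuses a hold on a dying node. */
static int node_hold(struct node *n) {
    if (n->refs == 0)
        return (0);
    n->refs++;
    return (1);
}

static void node_rele(struct node *n) { n->refs--; }

static void expensive_work(struct node *n) {
    printf("evicting buffers of node %d\n", n->id);
}

/*
 * Walk the list, dropping the lock around the expensive call.  Holding a
 * reference on the *next* node before unlocking keeps the cursor valid
 * even if other threads remove nodes meanwhile.
 */
static void walk(struct node *head) {
    struct node *n, *next;

    pthread_mutex_lock(&list_lock);
    for (n = head; n && !node_hold(n); n = n->next)
        continue;
    while (n) {
        next = n->next;
        while (next && !node_hold(next))
            next = next->next;
        pthread_mutex_unlock(&list_lock);
        expensive_work(n);              /* safe: we hold a reference */
        pthread_mutex_lock(&list_lock);
        node_rele(n);
        n = next;
    }
    pthread_mutex_unlock(&list_lock);
}

int main(void) {
    struct node c = { NULL, 1, 3 }, b = { &c, 0, 2 }, a = { &b, 1, 1 };
    walk(&a);   /* node 2 has refs == 0 (dying) and is skipped */
    return (0);
}
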
*/ os.os = osi; - (void) dmu_objset_evict_dbufs(&os, 0); + (void) dmu_objset_evict_dbufs(&os); ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); @@ -387,6 +466,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); + mutex_destroy(&osi->os_user_ptr_lock); kmem_free(osi, sizeof (objset_impl_t)); } @@ -399,7 +479,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (ds) + mutex_enter(&ds->ds_opening_lock); VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + if (ds) + mutex_exit(&ds->ds_opening_lock); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -443,14 +527,15 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } struct oscarg { - void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; dsl_dataset_t *clone_parent; const char *lastname; dmu_objset_type_t type; + uint64_t flags; }; -/* ARGSUSED */ +/*ARGSUSED*/ static int dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -478,11 +563,12 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (oa->clone_parent->ds_phys->ds_num_children == 0) return (EINVAL); } + return (0); } static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; @@ -493,10 +579,9 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, tx); + oa->clone_parent, oa->flags, cr, tx); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); if (BP_IS_HOLE(bp)) { objset_impl_t *osi; @@ -506,15 +591,19 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, tx); + oa->userfunc(&osi->os, oa->userarg, cr, tx); } - dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + + spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dsobj); + + dsl_dataset_rele(ds, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; const char *tail; @@ -536,6 +625,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, oa.userarg = arg; oa.lastname = tail; oa.type = type; + oa.flags = flags; + if (clone_parent != NULL) { /* * You can't clone to a different type. @@ -564,33 +655,47 @@ dmu_objset_destroy(const char *name) * It would be nicer to do this in dsl_dataset_destroy_sync(), * but the replay log objset is modified in open context. 
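
dmu_objset_create() below funnels through dsl_sync_task_do() with a check/sync pair: the check function validates in syncing context and may fail, while the sync function then applies the change and must not fail. A compilable model of that two-phase contract; sync_task_do and the tx handle here are stand-ins, not the DSL API:

#include <stdio.h>
#include <errno.h>

typedef struct tx { long txg; } dmu_tx_hyp_t;  /* stand-in transaction handle */

typedef int  (*check_fn_t)(void *arg1, void *arg2, dmu_tx_hyp_t *tx);
typedef void (*sync_fn_t)(void *arg1, void *arg2, dmu_tx_hyp_t *tx);

/*
 * Run check() first; only if validation passes is sync() executed.  In
 * ZFS both run in syncing context, so the state that check() examined
 * cannot change underneath sync().
 */
static int
sync_task_do(check_fn_t check, sync_fn_t sync, void *a1, void *a2)
{
    dmu_tx_hyp_t tx = { .txg = 42 };
    int err = check(a1, a2, &tx);
    if (err)
        return (err);
    sync(a1, a2, &tx);      /* must not fail once check() passed */
    return (0);
}

struct createarg { const char *name; int exists; };

static int
create_check(void *arg1, void *arg2, dmu_tx_hyp_t *tx)
{
    (void)arg2; (void)tx;
    struct createarg *ca = arg1;
    return (ca->exists ? EEXIST : 0);
}

static void
create_sync(void *arg1, void *arg2, dmu_tx_hyp_t *tx)
{
    (void)arg2;
    struct createarg *ca = arg1;
    printf("txg %ld: created %s\n", tx->txg, ca->name);
}

int
main(void)
{
    struct createarg ca = { "tank/home", 0 };
    printf("err = %d\n", sync_task_do(create_check, create_sync, &ca, NULL));
    ca.exists = 1;
    printf("err = %d\n", sync_task_do(create_check, create_sync, &ca, NULL));
    return (0);
}
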
*/ - error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); + error = dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); if (error == 0) { + dsl_dataset_t *ds = os->os->os_dsl_dataset; zil_destroy(dmu_objset_zil(os), B_FALSE); - dmu_objset_close(os); + + error = dsl_dataset_destroy(ds, os); + /* + * dsl_dataset_destroy() closes the ds. + */ + kmem_free(os, sizeof (objset_t)); } - return (dsl_dataset_destroy(name)); + return (error); } +/* + * This will close the objset. + */ int -dmu_objset_rollback(const char *name) +dmu_objset_rollback(objset_t *os) { int err; - objset_t *os; + dsl_dataset_t *ds; - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err == 0) { - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) - zil_resume(dmu_objset_zil(os)); - if (err == 0) { - /* XXX uncache everything? */ - err = dsl_dataset_rollback(os->os->os_dsl_dataset); - } + ds = os->os->os_dsl_dataset; + + if (!dsl_dataset_tryown(ds, TRUE, os)) { dmu_objset_close(os); + return (EBUSY); } + + err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); + + /* + * NB: we close the objset manually because the rollback + * actually implicitly called dmu_objset_evict(), thus freeing + * the objset_impl_t. + */ + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); return (err); } @@ -598,6 +703,13 @@ struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; char failed[MAXPATHLEN]; + boolean_t checkperms; + list_t objsets; +}; + +struct osnode { + list_node_t node; + objset_t *os; }; static int @@ -605,20 +717,25 @@ dmu_objset_snapshot_one(char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; - dmu_objset_stats_t stat; int err; (void) strcpy(sn->failed, name); - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os); + /* + * Check permissions only when requested. This only applies when + * doing a recursive snapshot. The permission checks for the starting + * dataset have already been performed in zfs_secpolicy_snapshot() + */ + if (sn->checkperms == B_TRUE && + (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + return (err); + + err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); if (err != 0) return (err); - /* - * If the objset is in an inconsistent state, return busy. 
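
The reworked dmu_objset_rollback() above starts from a plain hold and upgrades it with dsl_dataset_tryown(), returning EBUSY instead of blocking when another consumer already owns the dataset. A sketch of such a non-blocking ownership slot using C11 atomics; ds_tryown and ds_disown are invented names modeling the idea, not the real functions:

#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

struct dataset {
    _Atomic(void *) owner;      /* NULL when unowned */
};

/* Try to become the exclusive owner; fail instead of blocking. */
static int
ds_tryown(struct dataset *ds, void *tag)
{
    void *expected = NULL;
    return (atomic_compare_exchange_strong(&ds->owner, &expected, tag));
}

static void
ds_disown(struct dataset *ds, void *tag)
{
    void *expected = tag;       /* release only if we really own it */
    atomic_compare_exchange_strong(&ds->owner, &expected, NULL);
}

static int
rollback(struct dataset *ds, void *tag)
{
    if (!ds_tryown(ds, tag))
        return (EBUSY);         /* same contract as the patched rollback */
    puts("rolling back...");
    ds_disown(ds, tag);
    return (0);
}

int
main(void)
{
    struct dataset ds = { NULL };
    static char tag_a, tag_b;

    ds_tryown(&ds, &tag_a);                         /* someone else owns it */
    printf("err = %d\n", rollback(&ds, &tag_b));    /* prints EBUSY */
    ds_disown(&ds, &tag_a);
    printf("err = %d\n", rollback(&ds, &tag_b));    /* prints 0 */
    return (0);
}
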
- */ - dmu_objset_fast_stat(os, &stat); - if (stat.dds_inconsistent) { + /* If the objset is in an inconsistent state, return busy */ + if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { dmu_objset_close(os); return (EBUSY); } @@ -630,8 +747,13 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { + struct osnode *osn; dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os, sn->snapname, 3); + dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, + sn->snapname, 3); + osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); + osn->os = os; + list_insert_tail(&sn->objsets, osn); } else { dmu_objset_close(os); } @@ -643,31 +765,28 @@ int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) { dsl_sync_task_t *dst; + struct osnode *osn; struct snaparg sn = { 0 }; - char *cp; spa_t *spa; int err; (void) strcpy(sn.failed, fsname); - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; + list_create(&sn.objsets, sizeof (struct osnode), + offsetof(struct osnode, node)); if (recursive) { + sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { + sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } @@ -678,13 +797,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) - dmu_objset_name(os, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_close(os); + dsl_dataset_name(ds, sn.failed); } + out: + while (osn = list_head(&sn.objsets)) { + list_remove(&sn.objsets, osn); + zil_resume(dmu_objset_zil(osn->os)); + dmu_objset_close(osn->os); + kmem_free(osn, sizeof (struct osnode)); + } + list_destroy(&sn.objsets); + if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -717,39 +843,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) static void ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; objset_impl_t *os = arg; - blkptr_t *bp = os->os_rootbp; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - int i; + + ASSERT(bp == os->os_rootbp); + ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); + ASSERT(BP_GET_LEVEL(bp) == 0); /* * Update rootbp fill count. 
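
dmu_objset_snapshot() above now records every successfully prepared objset on the sn.objsets list, so the single cleanup loop at out: resumes the ZIL and closes exactly the datasets that were suspended, no matter where in the batch a failure occurred. The shape of that prepare-all/cleanup-all idiom, with the ZIL calls mocked by hypothetical *_hyp functions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct osnode {
    struct osnode *next;
    char name[64];
};

/* stand-ins for zil_suspend()/zil_resume() on one dataset */
static int zil_suspend_hyp(const char *name) {
    printf("suspend log of %s\n", name);
    return (strcmp(name, "tank/bad") == 0);     /* simulate one failure */
}
static void zil_resume_hyp(const char *name) { printf("resume log of %s\n", name); }

/*
 * Prepare every dataset, remembering the ones that succeeded; whatever
 * happens afterwards, the cleanup loop undoes exactly that set.
 */
static int
snapshot_all(const char **names, int n)
{
    struct osnode *head = NULL, *osn;
    int err = 0;

    for (int i = 0; i < n && err == 0; i++) {
        err = zil_suspend_hyp(names[i]);
        if (err == 0) {
            osn = malloc(sizeof (*osn));
            strcpy(osn->name, names[i]);
            osn->next = head;
            head = osn;
        }
    }
    if (err == 0)
        puts("running snapshot sync task group");

    while ((osn = head) != NULL) {      /* always runs: the 'out:' path */
        head = osn->next;
        zil_resume_hyp(osn->name);
        free(osn);
    }
    return (err);
}

int
main(void)
{
    const char *ok[] = { "tank/a", "tank/b" };
    const char *bad[] = { "tank/a", "tank/bad", "tank/c" };
    printf("err = %d\n\n", snapshot_all(ok, 2));
    printf("err = %d\n", snapshot_all(bad, 3));
    return (0);
}
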
*/ bp->blk_fill = 1; /* count the meta-dnode */ - for (i = 0; i < dnp->dn_nblkptr; i++) + for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; -} -/* ARGSUSED */ -static void -killer(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - objset_impl_t *os = arg; - - ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, zio, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); } - arc_release(os->os_phys_buf, &os->os_phys_buf); } /* called from dsl */ @@ -758,10 +875,10 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; - int zio_flags; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -783,19 +900,24 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) */ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; zb.zb_object = 0; - zb.zb_level = -1; + zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ zb.zb_blkid = 0; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + + wp.wp_type = DMU_OT_OBJSET; + wp.wp_level = 0; /* on-disk BP level; see above */ + wp.wp_copies = os->os_copies; + wp.wp_oschecksum = os->os_checksum; + wp.wp_oscompress = os->os_compress; + + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { + (void) dsl_dataset_block_kill(os->os_dsl_dataset, os->os_rootbp, pio, tx); - zio = arc_write(pio, os->os_spa, os->os_md_checksum, - os->os_md_compress, - dmu_get_replication_level(os, &zb, DMU_OT_OBJSET), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + } + + arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync meta-dnode - the parent IO for the sync is the root block @@ -819,6 +941,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) * Free intent log blocks up to this tx. 
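
The ready() callback above recomputes the root block pointer's fill as 1 (counting the meta-dnode itself) plus the already-computed blk_fill of each child block pointer. The same arithmetic on a toy blkptr array, as a quick sanity check of the accounting:

#include <stdio.h>
#include <stdint.h>

struct blkptr {
    uint64_t blk_fill;  /* number of in-use blocks at or below this bp */
};

/*
 * A root's fill is one for itself plus the (already computed) fill of
 * each child pointer.
 */
static uint64_t
root_fill(const struct blkptr *children, int nblkptr)
{
    uint64_t fill = 1;                  /* count the meta-dnode */
    for (int i = 0; i < nblkptr; i++)
        fill += children[i].blk_fill;
    return (fill);
}

int
main(void)
{
    struct blkptr dn_blkptr[3] = { {12}, {0}, {5} };    /* one hole child */
    printf("root blk_fill = %llu\n",                    /* 1+12+0+5 = 18 */
        (unsigned long long)root_fill(dn_blkptr, 3));
    return (0);
}
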
*/ zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } @@ -867,8 +990,23 @@ dmu_objset_is_snapshot(objset_t *os) } int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + uint64_t ignored; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, + real, maxlen, conflict)); +} + +int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os->os_dsl_dataset; zap_cursor_t cursor; @@ -894,6 +1032,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); @@ -938,48 +1078,80 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } +struct findarg { + int (*func)(char *, void *); + void *arg; +}; + +/* ARGSUSED */ +static int +findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct findarg *fa = arg; + return (fa->func((char *)dsname, fa->arg)); +} + /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * Perhaps change all callers to use dmu_objset_find_spa()? */ int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) { + struct findarg fa; + fa.func = func; + fa.arg = arg; + return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); +} + +/* + * Find all objsets under name, call func on each + */ +int +dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +{ dsl_dir_t *dd; - objset_t *os; - uint64_t snapobj; + dsl_pool_t *dp; + dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; - int do_self, err; + uint64_t thisobj; + int err; - err = dsl_dir_open(name, FTAG, &dd, NULL); + if (name == NULL) + name = spa_name(spa); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); if (err) return (err); - /* NB: the $MOS dir doesn't have a head dataset */ - do_self = (dd->dd_phys->dd_head_dataset_obj != 0); + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_close(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + dp = dd->dd_pool; /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, + for (zap_cursor_init(&zc, dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - /* - * No separating '/' because parent's name ends in /. 
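
dmu_objset_find() below survives only as a thin wrapper: struct findarg plus the findfunc() trampoline adapt the legacy int (*)(char *, void *) callback to dmu_objset_find_spa()'s richer signature. The same adapter pattern reduced to a compilable sketch; the iterator body here is fake and only visits two hard-coded names:

#include <stdio.h>
#include <stdint.h>

typedef int (*old_cb_t)(char *name, void *arg);
typedef int (*new_cb_t)(void *spa, uint64_t obj, const char *name, void *arg);

struct findarg {
    old_cb_t func;
    void *arg;
};

/* Trampoline: discard the extra parameters, call the legacy callback. */
static int
findfunc(void *spa, uint64_t obj, const char *name, void *arg)
{
    (void)spa; (void)obj;
    struct findarg *fa = arg;
    return (fa->func((char *)name, fa->arg));
}

/* The "new" iterator; a real one would walk the pool. */
static int
find_spa(new_cb_t func, void *arg)
{
    int err;
    if ((err = func(NULL, 21, "tank", arg)) != 0)
        return (err);
    return (func(NULL, 33, "tank/home", arg));
}

/* The legacy entry point, preserved as a thin wrapper. */
static int
find(old_cb_t func, void *arg)
{
    struct findarg fa = { func, arg };
    return (find_spa(findfunc, &fa));
}

static int
print_cb(char *name, void *arg)
{
    (void)arg;
    printf("visited %s\n", name);
    return (0);
}

int
main(void)
{
    return (find(print_cb, NULL));
}

The design choice mirrors the comment in the hunk: existing callers keep working unchanged while new code can move to the spa/object-aware interface at its own pace.
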
- */ child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); + (void) strcpy(child, name); (void) strcat(child, "/"); (void) strcat(child, attr->za_name); - err = dmu_objset_find(child, func, arg, flags); + err = dmu_objset_find_spa(spa, child, func, arg, flags); kmem_free(child, MAXPATHLEN); if (err) break; @@ -996,30 +1168,36 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Iterate over all snapshots. */ - if ((flags & DS_FIND_SNAPSHOTS) && - dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { - - snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; - dmu_objset_close(os); - - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + if (flags & DS_FIND_SNAPSHOTS) { + if (!dsl_pool_sync_context(dp)) + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (!dsl_pool_sync_context(dp)) + rw_exit(&dp->dp_config_rwlock); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); - err = func(child, arg); - kmem_free(child, MAXPATHLEN); - if (err) - break; + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == + sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(child, name); + (void) strcat(child, "@"); + (void) strcat(child, attr->za_name); + err = func(spa, attr->za_first_integer, + child, arg); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); } - zap_cursor_fini(&zc); } dsl_dir_close(dd, FTAG); @@ -1031,7 +1209,20 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Apply to self if appropriate. */ - if (do_self) - err = func(name, arg); + err = func(spa, thisobj, name, arg); return (err); } + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + os->os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + return (os->os->os_user_ptr); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 3e55dc301620..1294581a7133 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
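
The dmu_send.c changes that follow frame the replay stream with a DRR_BEGIN record whose 64-bit magic doubles as a byte-order probe: dmu_recv_begin() further down accepts either the native constant or its byte-swapped image, and in the latter case flips every header field. A standalone sketch of that probe; the constant matches DMU_BACKUP_MAGIC as defined in the headers of this era, but treat it as illustrative here:

#include <stdint.h>
#include <stdio.h>

#define DMU_BACKUP_MAGIC 0x2F5bacbacULL    /* assumed value, for illustration */

static uint64_t
bswap64(uint64_t x)
{
    return  ((x & 0x00000000000000ffULL) << 56) |
            ((x & 0x000000000000ff00ULL) << 40) |
            ((x & 0x0000000000ff0000ULL) << 24) |
            ((x & 0x00000000ff000000ULL) <<  8) |
            ((x & 0x000000ff00000000ULL) >>  8) |
            ((x & 0x0000ff0000000000ULL) >> 24) |
            ((x & 0x00ff000000000000ULL) >> 40) |
            ((x & 0xff00000000000000ULL) >> 56);
}

/*
 * A stream written on a machine of either byte order is accepted: the
 * magic reads correctly, or it reads as the byte-swapped constant and
 * every other header field must be swapped too.  Any third value means
 * this is not a replay stream at all (EINVAL in dmu_recv_begin).
 */
static int
detect_byteswap(uint64_t magic, int *byteswap)
{
    if (magic == DMU_BACKUP_MAGIC)
        *byteswap = 0;
    else if (magic == bswap64(DMU_BACKUP_MAGIC))
        *byteswap = 1;
    else
        return (-1);
    return (0);
}

int
main(void)
{
    int bs;
    if (detect_byteswap(bswap64(DMU_BACKUP_MAGIC), &bs) == 0)
        printf("stream accepted, byteswap = %d\n", bs);
    return (0);
}
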
*/ @@ -41,10 +41,13 @@ #include <sys/zap.h> #include <sys/zio_checksum.h> +static char *dmu_recv_tag = "dmu_recv_tag"; + struct backuparg { dmu_replay_record_t *drr; kthread_t *td; struct file *fp; + offset_t *off; objset_t *os; zio_cksum_t zc; int err; @@ -77,6 +80,7 @@ dump_bytes(struct backuparg *ba, void *buf, int len) fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); ba->err = EOPNOTSUPP; #endif + *ba->off += len; return (ba->err); } @@ -179,7 +183,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) void *data = bc->bc_data; int err = 0; - if (SIGPENDING(curthread)) + if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); ASSERT(data || bp == NULL); @@ -215,10 +219,9 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) zb.zb_object = object; zb.zb_level = level; zb.zb_blkid = blkid; - (void) arc_read(NULL, spa, bp, - dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); + (void) arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); if (abuf) { err = dump_data(ba, type, object, blkid * blksz, @@ -236,13 +239,15 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) } int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + struct file *fp, offset_t *off) { dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; + uint64_t fromtxg = 0; /* tosnap must be a snapshot */ if (ds->ds_phys->ds_next_snap_obj == 0) @@ -250,26 +255,55 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) /* fromsnap must be an earlier snapshot from the same fs as tosnap */ if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= - ds->ds_phys->ds_creation_txg)) + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) return (EXDEV); + if (fromorigin) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (fromds) drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + ba.drr = drr; ba.td = curthread; ba.fp = fp; ba.os = tosnap; + ba.off = off; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { @@ -277,8 +311,7 @@ dmu_sendbackup(objset_t *tosnap, 
objset_t *fromsnap, struct file *fp) return (ba.err); } - err = traverse_dsl_dataset(ds, - fromds ? fromds->ds_phys->ds_creation_txg : 0, + err = traverse_dsl_dataset(ds, fromtxg, ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, backup_cb, &ba); @@ -303,164 +336,384 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) return (0); } -struct restorearg { - int err; - int byteswap; - kthread_t *td; - struct file *fp; - char *buf; - uint64_t voff; - int buflen; /* number of valid bytes in buf */ - int bufoff; /* next offset to read */ - int bufsize; /* amount of memory allocated for buf */ - zio_cksum_t zc; +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ }; +static dsl_dataset_t * +recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + + /* This should always work, since we just created it */ + /* XXX - create should return an owned ds */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); + + if (type != DMU_OST_NONE) { + (void) dmu_objset_create_impl(dp->dp_spa, + ds, &ds->ds_phys->ds_bp, type, tx); + } + + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + + return (ds); +} + /* ARGSUSED */ static int -replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct drr_begin *drrb = arg2; - const char *snapname; - int err; + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val; + int err; - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) - return (ENODEV); - /* must not have any changes since most recent snapshot */ - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (ETXTBSY); + if (err != ENOENT) + return (err ? err : EEXIST); - /* new snapshot name must not exist */ - snapname = strrchr(drrb->drr_toname, '@'); - if (snapname == NULL) - return (EEXIST); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } - snapname++; - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) + return (0); +} + +static void +recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, cr, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); +} + +static int +recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + + /* must be a head ds */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* must not be a clone ds */ + if (dsl_dir_is_clone(ds->ds_dir)) + return (EINVAL); + + err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + if (err) return (err); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + return (0); } -/* ARGSUSED */ static void -replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + struct recvbeginsyncarg *rbsa = arg2; + dsl_dir_t *dd = ds->ds_dir; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* + * NB: caller must provide an extra hold on the dsl_dir_t, so it + * won't go away when dsl_dataset_destroy_sync() closes the + * dataset. + */ + dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + + dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); } /* ARGSUSED */ static int -replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - char *cp; - uint64_t val; + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; int err; + uint64_t val; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(drrb->drr_toname, '/') + 1, - sizeof (uint64_t), 1, &val); - *cp = '@'; + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); if (err != ENOENT) - return (err ? 
err : EEXIST); + return (err); + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); return (0); } +/* ARGSUSED */ static void -replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - char *cp; - dsl_dataset_t *ds; + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *ods, *cds; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1, - NULL, tx); - *cp = '@'; + /* create the temporary clone */ + VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, + FTAG, &ods)); + dsobj = dsl_dataset_create_sync(ohds->ds_dir, + rbsa->clonelastname, ods, flags, cr, tx); + dsl_dataset_rele(ods, FTAG); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_EXCLUSIVE, FTAG, &ds)); + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); - (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); + rbsa->ds = cds; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); +} + +/* ARGSUSED */ +static void +recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_object); } -static int -replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) { - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa; + uint64_t version; + int flags; + dsl_dataset_t *ds; - /* XXX verify that drr_toname is in dd */ + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); - snapname = strchr(drrb->drr_toname, '@'); - if (snapname == NULL) + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? 
origin->os->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + version = drrb->drr_version; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + version = BSWAP_64(version); + flags = BSWAP_32(flags); + } + + if (version != DMU_BACKUP_STREAM_VERSION || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); - snapname++; - return (dsl_dataset_snapshot_check(os, snapname, tx)); -} + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; -static void -replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; - dsl_dataset_t *ds, *hds; + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_force = force; - snapname = strchr(drrb->drr_toname, '@') + 1; + /* + * Process the begin in syncing context. + */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { + /* offline incremental receive */ + err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_snapshot_sync(os, snapname, tx); + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + rbsa.fromguid) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + rbsa.force = B_FALSE; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_offline_incremental_sync, ds, &rbsa, 1); + if (err) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = drc->drc_real_ds = ds; + } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* online incremental receive */ - /* set snapshot's creation time and guid */ - hds = os->os->os_dsl_dataset; - VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, - hds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds)); + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_creation_time = drrb->drr_creation_time; - ds->ds_phys->ds_guid = drrb->drr_toguid; - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_online_incremental_sync, ds, &rbsa, 5); + if (err) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else { + /* create new fs -- full backup or clone */ + dsl_dir_t *dd = NULL; + const char *tail; + + err = dsl_dir_open(tofs, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + if (!force) { + dsl_dir_close(dd, FTAG); + return (EEXIST); + } + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dataset_own_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + DS_MODE_INCONSISTENT, FTAG, &ds); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + + dsl_dataset_make_exclusive(ds, FTAG); + err = 
dsl_sync_task_do(dd->dd_pool, + recv_full_existing_check, + recv_full_existing_sync, ds, &rbsa, 5); + dsl_dataset_disown(ds, FTAG); + } else { + err = dsl_sync_task_do(dd->dd_pool, recv_full_check, + recv_full_sync, dd, &rbsa, 5); + } + dsl_dir_close(dd, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } - dmu_buf_will_dirty(hds->ds_dbuf, tx); - hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + return (0); } +struct restorearg { + int err; + int byteswap; + kthread_t *td; + struct file *fp; + char *buf; + uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; +}; + static int restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) { @@ -491,37 +744,31 @@ static void * restore_read(struct restorearg *ra, int len) { void *rv; + int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT3U(len % 8, ==, 0); - while (ra->buflen - ra->bufoff < len) { + while (done < len) { int resid; - int leftover = ra->buflen - ra->bufoff; - (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); + ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, + len - done, ra->voff, &resid); - ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover, - ra->bufsize - leftover, ra->voff, &resid); - - ra->voff += ra->bufsize - leftover - resid; - ra->buflen = ra->bufsize - resid; - ra->bufoff = 0; - if (resid == ra->bufsize - leftover) + if (resid == len - done) ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; if (ra->err) return (NULL); - /* Could compute checksum here? */ } - ASSERT3U(ra->bufoff % 8, ==, 0); - ASSERT3U(ra->buflen - ra->bufoff, >=, len); - rv = ra->buf + ra->bufoff; - ra->bufoff += len; + ASSERT3U(done, ==, len); + rv = ra->buf; if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->zc); + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); else - fletcher_4_incremental_native(rv, len, &ra->zc); + fletcher_4_incremental_native(rv, len, &ra->cksum); return (rv); } @@ -531,12 +778,14 @@ backup_byteswap(dmu_replay_record_t *drr) #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_version); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; @@ -643,13 +892,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (data == NULL) { dmu_tx_commit(tx); return (ra->err); } - bcopy(data, db->db_data, db->db_size); + bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); @@ -673,23 +922,14 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; int err; if (dmu_object_info(os, obj, NULL) != 0) continue; - tx = dmu_tx_create(os); - 
dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + err = dmu_free_object(os, obj); + if (err) return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); } return (0); } @@ -735,7 +975,6 @@ static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { - dmu_tx_t *tx; int err; if (drrf->drr_length != -1ULL && @@ -745,66 +984,65 @@ restore_free(struct restorearg *ra, objset_t *os, if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (EINVAL); - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, + err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); return (err); } +void +dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) +{ + if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { + /* + * online incremental or new fs: destroy the fs (which + * may be a clone) that we created + */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (drc->drc_real_ds != drc->drc_logical_ds) + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } else { + /* + * offline incremental: rollback to most recent snapshot. + */ + (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); + dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); + } +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. + */ int -dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct file *fp, uint64_t voffset) +dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) { kthread_t *td = curthread; - struct restorearg ra; + struct restorearg ra = { 0 }; dmu_replay_record_t *drr; - char *cp; - objset_t *os = NULL; - zio_cksum_t pzc; - - bzero(&ra, sizeof (ra)); - ra.td = td; - ra.fp = fp; - ra.voff = voffset; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + objset_t *os; + zio_cksum_t pcksum; - if (drrb->drr_magic == DMU_BACKUP_MAGIC) { - ra.byteswap = FALSE; - } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; - } else { - ra.err = EINVAL; - goto out; - } - /* - * NB: this assumes that struct drr_begin will be the largest in - * dmu_replay_record_t's drr_u, and thus we don't need to pad it - * with zeros to make it the same length as we wrote out. 
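
The reworked restore_read() just below drops the old read-ahead buffer: it now loops until exactly len bytes have arrived and treats a read that makes no progress as EINVAL, i.e. a stream truncated mid-record. A user-space sketch of the same loop over a POSIX descriptor:

#include <unistd.h>
#include <errno.h>
#include <stdio.h>

/*
 * Read exactly len bytes from fd into buf.  Returns 0 on success,
 * EINVAL on premature end of stream, or the read(2) errno.  Loop until
 * 'done == len'; a read that returns no data means the record was cut
 * short.
 */
static int
read_exact(int fd, void *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = read(fd, (char *)buf + done, len - done);
        if (n < 0)
            return (errno);
        if (n == 0)
            return (EINVAL);    /* record truncated mid-stream */
        done += (size_t)n;
    }
    return (0);
}

int
main(void)
{
    char hdr[8];
    int err = read_exact(0, hdr, sizeof (hdr));   /* e.g. pipe a stream in */
    printf("read_exact: %d\n", err);
    return (err != 0);
}
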
- */ - ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; - ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; - ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); - } else { - fletcher_4_incremental_native(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); } - (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_version = BSWAP_64(drrb->drr_version); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); @@ -813,94 +1051,30 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - if (drrb->drr_version != DMU_BACKUP_VERSION || - drrb->drr_type >= DMU_OST_NUMTYPES || - strchr(drrb->drr_toname, '@') == NULL) { - ra.err = EINVAL; - goto out; - } - - /* - * Process the begin in syncing context. - */ - if (drrb->drr_fromguid) { - /* incremental backup */ - dsl_dataset_t *ds = NULL; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); - *cp = '@'; - if (ra.err) - goto out; - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - drrb->drr_fromguid) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(ra.buf, ra.bufsize); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds); - } - ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, - replay_incremental_check, replay_incremental_sync, - ds, drrb, 1); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full backup */ - dsl_dir_t *dd = NULL; - const char *tail; - - /* can't restore full backup into topmost fs, for now */ - if (strrchr(drrb->drr_toname, '/') == NULL) { - ra.err = EINVAL; - goto out; - } - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); - *cp = '@'; - if (ra.err) - goto out; - if (tail == NULL) { - ra.err = EEXIST; - goto out; - } + ra.td = td; + ra.fp = fp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, - replay_full_sync, dd, drrb, 5); - dsl_dir_close(dd, FTAG); - } - if (ra.err) - goto out; + /* these were verified in dmu_recv_begin */ + ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ + VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, - DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); - *cp = '@'; - ASSERT3U(ra.err, ==, 0); + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); /* * Read records and process them. 
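
Both ends of the stream maintain a running Fletcher-4 over every byte written or consumed (ba.zc on the send side, ra.cksum here), and the DRR_END record carries the checksum of everything before it, so verification in the receive loop below is a single compare against the running value. A self-contained sketch of incremental Fletcher-4 as ZFS computes it (four cascaded 64-bit sums over 32-bit words); the stream guarantees 8-byte-aligned record sizes, which is assumed here:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint64_t w[4]; } cksum_t;

/*
 * Incremental Fletcher-4: feeding consecutive chunks gives the same
 * result as one pass over the whole buffer, because the four
 * accumulators carry all the state between calls.  That is what lets
 * the receive loop checksum records as they stream past.
 */
static void
fletcher_4_incremental(const void *buf, size_t size, cksum_t *zcp)
{
    const uint32_t *ip = buf;
    const uint32_t *ipend = ip + (size / sizeof (uint32_t));
    uint64_t a = zcp->w[0], b = zcp->w[1], c = zcp->w[2], d = zcp->w[3];

    for (; ip < ipend; ip++) {
        a += *ip;
        b += a;
        c += b;
        d += c;
    }
    zcp->w[0] = a; zcp->w[1] = b; zcp->w[2] = c; zcp->w[3] = d;
}

int
main(void)
{
    uint32_t data[4] = { 1, 2, 3, 4 };
    cksum_t one = { {0} }, two = { {0} };

    fletcher_4_incremental(data, sizeof (data), &one);  /* all at once */
    fletcher_4_incremental(&data[0], 8, &two);          /* first half */
    fletcher_4_incremental(&data[2], 8, &two);          /* second half */
    printf("match: %d\n", memcmp(&one, &two, sizeof (one)) == 0);
    return (0);
}
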
*/ - pzc = ra.zc; + pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (SIGPENDING(td)) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } @@ -947,63 +1121,116 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, * value, because the stored checksum is of * everything before the DRR_END record. */ - if (drre.drr_checksum.zc_word[0] != 0 && - !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) { + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = ECKSUM; - goto out; - } - - ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> - ds_dir->dd_pool, replay_end_check, replay_end_sync, - os, drrb, 3); goto out; } default: ra.err = EINVAL; goto out; } - pzc = ra.zc; + pcksum = ra.cksum; } + ASSERT(ra.err != 0); out: - if (os) - dmu_objset_close(os); + dmu_objset_close(os); - /* - * Make sure we don't rollback/destroy unless we actually - * processed the begin properly. 'os' will only be set if this - * is the case. - */ - if (ra.err && os && tosnap && strchr(tosnap, '@')) { + if (ra.err != 0) { /* * rollback or destroy what we created, so we don't * leave it in the restoring state. */ - dsl_dataset_t *ds; - int err; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - err = dsl_dataset_open(tosnap, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err == 0) { - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (drrb->drr_fromguid) { - /* incremental: rollback to most recent snap */ - (void) dsl_dataset_rollback(ds); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full: destroy whole fs */ - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - (void) dsl_dataset_destroy(tosnap); - } - } - *cp = '@'; + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + dmu_recv_abort_cleanup(drc); } kmem_free(ra.buf, ra.bufsize); - if (sizep) - *sizep = ra.voff; + *voffp = ra.voff; return (ra.err); } + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and + * dsl_pool_zil_clean() expects it to have a ds_user_ptr + * (and zil), but clone_swap() can close it. 
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + if (ds != drc->drc_real_ds) { + /* we are doing an online recv */ + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); + if (err) + dsl_dataset_disown(ds, dmu_recv_tag); + } else { + err = EBUSY; + dsl_dataset_rele(ds, dmu_recv_tag); + } + /* dsl_dataset_destroy() will disown the ds */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (err) + return (err); + } + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + if (drc->drc_newfs) { + ASSERT(ds == drc->drc_real_ds); + (void) dsl_dataset_destroy(ds, dmu_recv_tag); + return (err); + } else { + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + } + + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 3d2bc3e47678..43bf82e7a682 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu_impl.h> +#include <sys/zvol.h> #define BP_SPAN_SHIFT(level, width) ((level) * (width)) @@ -261,6 +262,16 @@ advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) return (EAGAIN); } +/* + * The traverse_callback function will call the function specified in th_func. + * In the event of an error the callee, specified by th_func, must return + * one of the following errors: + * + * EINTR - Indicates that the callee wants the traversal to + * abort immediately. + * ERESTART - The callee has acknowledged the error and would + * like to continue. + */ static int traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) { @@ -603,7 +614,10 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) th->th_locked = 0; } - rc = traverse_read(th, bc, &dsp->ds_bp, dn); + if (BP_IS_HOLE(&dsp->ds_bp)) + rc = ERESTART; + else + rc = traverse_read(th, bc, &dsp->ds_bp, dn); if (rc != 0) { if (rc == ERESTART) @@ -722,6 +736,24 @@ traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, } int +traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) +{ + spa_t *spa = dmu_objset_spa(os); + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); + + traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int traverse_more(traverse_handle_t *th) { zseg_t *zseg = list_head(&th->th_seglist); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 13fd8d4d9dce..000c3ce64eb5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dbuf.h> @@ -157,7 +155,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (EIO); - err = dbuf_read(db, zio, DB_RF_CANFAIL); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } @@ -294,6 +292,8 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) txh->txh_space_tooverwrite += space; } else { txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; } } @@ -318,39 +318,25 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - uint64_t blkid, nblks; - uint64_t space = 0; + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int dirty; + int epbs; - /* - * We don't need to use any locking to check for dirtyness - * because it's OK if we get stale data -- the dnode may become - * dirty immediately after our check anyway. This is just a - * means to avoid the expensive count when we aren't sure we - * need it. We need to be able to deal with a dirty dnode. - */ - dirty = list_link_active(&dn->dn_dirty_link[0]) | - list_link_active(&dn->dn_dirty_link[1]) | - list_link_active(&dn->dn_dirty_link[2]) | - list_link_active(&dn->dn_dirty_link[3]); - if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) + if (dn->dn_nlevels == 0) return; /* - * the struct_rwlock protects us against dn_phys->dn_nlevels + * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * also, dbuf_hold_impl() wants us to have the struct_rwlock. - * - * It's fine to use dn_datablkshift rather than the dn_phys - * equivalent because if it is changing, maxblkid==0 and we will - * bail. + * Also, dbuf_hold_level() wants us to have the struct_rwlock. 
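
dmu_tx_count_free() just below converts the freed byte range into block numbers: blkid = off >> datablkshift, and the corrected count nblks = (len + datablksz - 1) >> datablkshift rounds the length up to whole blocks instead of shifting the end offset as the removed line did. A worked example of that arithmetic, assuming a block-aligned offset (the partial first/last blocks are handled by separate logic in the real function):

#include <stdio.h>
#include <stdint.h>

/*
 * Convert a byte range into {first block, block count} for a dataset
 * with a power-of-two block size.
 */
static void
byte_range_to_blocks(uint64_t off, uint64_t len, int datablkshift,
    uint64_t *blkid, uint64_t *nblks)
{
    uint64_t datablksz = 1ULL << datablkshift;

    *blkid = off >> datablkshift;
    *nblks = (len + datablksz - 1) >> datablkshift;     /* round up */
}

int
main(void)
{
    uint64_t blkid, nblks;

    /* free 300 KB starting at 1 MB in a dataset with 128 KB blocks */
    byte_range_to_blocks(1 << 20, 300 << 10, 17, &blkid, &nblks);
    printf("blkid = %llu, nblks = %llu\n",      /* blkid = 8, nblks = 3 */
        (unsigned long long)blkid, (unsigned long long)nblks);
    return (0);
}
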
*/ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_phys->dn_maxblkid == 0) { + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { if (off == 0 && len >= dn->dn_datablksz) { blkid = 0; nblks = 1; @@ -360,78 +346,120 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } else { blkid = off >> dn->dn_datablkshift; - nblks = (off + len) >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_phys->dn_maxblkid) { + if (blkid >= dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } - if (blkid + nblks > dn->dn_phys->dn_maxblkid) - nblks = dn->dn_phys->dn_maxblkid - blkid; + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; - /* don't bother after 128,000 blocks */ - nblks = MIN(nblks, 128*1024); } - - if (dn->dn_phys->dn_nlevels == 1) { + if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += bp_get_dasize(spa, bp); } + unref += BP_GET_ASIZE(bp); } nblks = 0; } + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels. + */ + { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = (dn->dn_nlevels > 1) ? 2 : 1; + + while (level++ < DN_MAX_LEVELS) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; - int err, epbs, blkoff, tochk; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - blkoff = P2PHASE(blkid, 1<<epbs); - tochk = MIN((1<<epbs) - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); - if (err == 0) { - int i; - blkptr_t *bp; - - err = dbuf_read(dbuf, NULL, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; + + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + if (err) { + txh->txh_tx->tx_err = err; + break; + } + if (new_blkid > lastblk) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } - bp = dbuf->db.db_data; - bp += blkoff; + if (new_blkid > blkid) { + ASSERT((new_blkid >> epbs) > (blkid >> epbs)); + skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth)) { - dprintf_bp(&bp[i], - "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); - } - } + dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + + txh->txh_memory_tohold += dbuf->db.db_size; + if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { + txh->txh_tx->tx_err = E2BIG; dbuf_rele(dbuf, FTAG); + break; } - if (err && err != ENOENT) { + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); break; } + bp = dbuf->db.db_data; + bp += 
blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); + blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* account for new level 1 indirect blocks that might show up */ + if (skipped > 0) { + txh->txh_fudge += skipped << dn->dn_indblkshift; + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; } void @@ -466,7 +494,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) /* * For i/o error checking, read the first and last level-0 * blocks, and all the level-1 blocks. The above count_write's - * will take care of the level-0 blocks. + * have already taken care of the level-0 blocks. */ if (dn->dn_nlevels > 1) { shift = dn->dn_datablkshift + dn->dn_indblkshift - @@ -478,7 +506,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH) break; @@ -550,10 +578,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * the size will change between now and the dbuf dirty call. */ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) + dn->dn_phys->dn_blkptr[0].blk_birth)) { txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - else + } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += + BP_GET_ASIZE(dn->dn_phys->dn_blkptr); + } return; } @@ -575,7 +606,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks */ dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + add ? 3 : 0) << dn->dn_datablkshift); + (3 + (add ? 3 : 0)) << dn->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -698,12 +729,13 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; break; case THT_FREE: - if (blkid == beginblk && - (txh->txh_arg1 != 0 || - dn->dn_maxblkid == 0)) - match_offset = TRUE; - if (blkid == endblk && - txh->txh_arg2 != DMU_OBJECT_END) + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_BONUS: @@ -733,12 +765,32 @@ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; - uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + spa_t *spa = tx->tx_pool->dp_spa; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) return (tx->tx_err); + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. 
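The txh_memory_tohold charge added to dmu_tx_count_free() above walks the indirect tree from level 2 upward, shrinking the block count by the entries-per-block shift at each step. The recurrence is easy to model outside the kernel; in this sketch DN_MAX_LEVELS, the 16K indirect block size and epbs = 7 are assumed values from this era's dnode geometry, not something the patch defines:

#include <stdint.h>
#include <stdio.h>

#define DN_MAX_LEVELS	6	/* assumed limit on indirection levels */

/*
 * Worst-case bytes of level-2-and-higher indirect blocks that must be
 * held in memory to free nblks level-0 blocks; the level-1 blocks are
 * charged separately as they are read in the loop above.
 */
static uint64_t
indirect_memory_tohold(uint64_t nblks, int epbs, int indblkshift)
{
	uint64_t memory = 0;
	uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
	int level = 2;

	while (level++ < DN_MAX_LEVELS) {
		memory += blkcnt << indblkshift;
		blkcnt = 1 + (blkcnt >> epbs);	/* one parent per 2^epbs */
	}
	return (memory);
}

int
main(void)
{
	/* free 1M blocks with 16K indirects (epbs = 14 - 7 = 7) */
	printf("%llu bytes\n",
	    (unsigned long long)indirect_memory_tohold(1ULL << 20, 7, 14));
	return (0);
}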
+ */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; @@ -748,7 +800,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = 0; + towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -768,6 +820,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) towrite += txh->txh_space_towrite; tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; + fudge += txh->txh_fudge; } /* @@ -788,22 +843,31 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) tooverwrite = tofree = 0; } - /* - * Convert logical size to worst-case allocated size. - */ + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - lsize = towrite + tooverwrite; - asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + /* convert unrefd space to worst-case estimate */ + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; #ifdef ZFS_DEBUG - tx->tx_space_towrite = asize; + /* + * Add in 'tohold' to account for our dirty holds on this memory + * XXX - the "fudge" factor is to account for skipped blocks that + * we missed because dnode_next_offset() misses in-core-only blocks. + */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); } @@ -885,10 +949,18 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) void dmu_tx_wait(dmu_tx_t *tx) { + spa_t *spa = tx->tx_pool->dp_spa; + ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - if (tx->tx_needassign_txh) { + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then its + * tx_lasttried_txg would not have been assigned.
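The suspended-pool check above is what callers observe through dmu_tx_assign(): ERESTART when a retry can eventually succeed (failmode=wait, or simply a full open txg), and EIO when failmode=continue asks write errors to propagate. The usual consumer idiom, sketched after the ZPL's use of the API (a kernel context, an open objset os, and the held object/off/len are assumed; this is illustrative, not a verbatim caller):

	dmu_tx_t *tx;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			/* pool suspended or txg full: wait, then retry */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);	/* failmode=continue surfaces EIO */
		return (err);
	}
	/* ... dirty the held range ... */
	dmu_tx_commit(tx);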
+ */ + if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); @@ -948,6 +1020,7 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), @@ -975,6 +1048,7 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index b25cc898c37d..8dba38176527 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -38,10 +38,6 @@ */ int zfs_prefetch_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, - &zfs_prefetch_disable, 0, "Disable prefetch"); /* max # of streams per zfetch */ uint32_t zfetch_max_streams = 8; @@ -52,6 +48,25 @@ uint32_t zfetch_block_cap = 256; /* number of bytes in a array_read at which we stop prefetching (1Mb) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, + &zfs_prefetch_disable, 0, "Disable prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN, + &zfetch_max_streams, 0, "Max # of streams per zfetch"); +TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, + &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); +TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN, + &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); +TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz); +SYSCTL_QUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, + &zfetch_array_rd_sz, 0, + "Number of bytes in a array_read at which we stop prefetching"); + /* forward decls for static routines */ static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index ca502857b1fa..5adbc3c0ff5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
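The relocated sysctl block above pairs each prefetch knob with a boot-time loader tunable and a read-only runtime sysctl under the new vfs.zfs.zfetch node. One entry of that shape, with a purely hypothetical knob name for illustration:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static uint32_t zfetch_example_cap = 64;	/* hypothetical knob */

SYSCTL_DECL(_vfs_zfs_zfetch);
/* seed the variable from /boot/loader.conf at boot... */
TUNABLE_INT("vfs.zfs.zfetch.example_cap", &zfetch_example_cap);
/* ...and expose it read-only at run time (RDTUN = read-only tunable) */
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, example_cap, CTLFLAG_RDTUN,
    &zfetch_example_cap, 0, "Example zfetch cap (illustrative)");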
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dbuf.h> #include <sys/dnode.h> @@ -242,6 +240,23 @@ free_range_compar(const void *node1, const void *node2) else return (0); } +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -285,6 +300,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); + arc_space_consume(sizeof (dnode_t)); return (dn); } @@ -319,6 +335,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t)); } void @@ -362,6 +379,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -389,6 +407,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -396,7 +415,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i; + int i, old_nblkptr; dmu_buf_impl_t *db = NULL; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); @@ -413,7 +432,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(!list_link_active(&dn->dn_dirty_link[i])); /* clean up any unreferenced dbufs */ - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -436,38 +455,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, } dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; rw_exit(&dn->dn_struct_rwlock); - if (db) { + if (db) dbuf_rele(db, FTAG); - db = NULL; - } /* change type */ dn->dn_type = ot; - if (dn->dn_bonuslen != bonuslen) { - /* change bonus size */ - if (bonuslen == 0) - bonuslen = 1; /* XXX */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - if (refcount_add(&db->db_holds, FTAG) == 1) - dnode_add_ref(dn, db); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT(db->db.db_data != NULL); - db->db.db_size = bonuslen; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - } - /* change bonus size and type */ mutex_enter(&dn->dn_mtx); + old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) 
>> SPA_BLKPTRSHIFT); @@ -475,12 +474,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* - * NB: we have to do the dbuf_rele after we've changed the - * dn_bonuslen, for the sake of dbuf_verify(). - */ - if (db) - dbuf_rele(db, FTAG); + /* XXX - for now, we can't make nblkptr smaller */ + ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); + + /* fix up the bonus db_size if dn_nblkptr has changed */ + if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); @@ -559,6 +561,12 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, dmu_buf_impl_t *db; dnode_t **children_dnodes; + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything*. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -602,9 +610,10 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, } if ((dn = children_dnodes[idx]) == NULL) { + dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, - db, object); + + dn = dnode_create(os, dnp, db, object); winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); if (winner != NULL) { dnode_destroy(dn); @@ -644,11 +653,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } -void +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. + */ +boolean_t dnode_add_ref(dnode_t *dn, void *tag) { - ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, tag); + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); } void @@ -656,7 +676,9 @@ dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) dbuf_rele(dn->dn_dbuf, dn); @@ -692,6 +714,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", @@ -714,7 +737,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. 
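The dn_next_bonuslen[] bookkeeping introduced with dnode_setbonuslen() above follows the DMU's per-txg shadow-field pattern: open context records the pending value under txg & TXG_MASK and dnode_sync() later folds it into the on-disk dnode. Since zero in the array means "nothing recorded this txg", a genuine shrink to zero is encoded with the DN_ZERO_BONUSLEN sentinel. A userland model (the DN_MAX_BONUSLEN value is an assumption taken from dnode.h, and the dnode itself is reduced to one field):

#include <stdint.h>
#include <stdio.h>

#define TXG_SIZE		4
#define TXG_MASK		(TXG_SIZE - 1)
#define DN_MAX_BONUSLEN		320			/* assumed */
#define DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)	/* "shrink to 0" */

static uint16_t dn_next_bonuslen[TXG_SIZE];	/* 0 == no change pending */

/* open context: record the pending bonus length for this txg */
static void
setbonuslen(uint64_t txg, int newsize)
{
	dn_next_bonuslen[txg & TXG_MASK] =
	    (newsize == 0) ? DN_ZERO_BONUSLEN : newsize;
}

/* syncing context: fold the pending value into the on-disk dnode */
static void
sync_bonuslen(uint64_t txg, uint16_t *dnp_bonuslen)
{
	uint16_t next = dn_next_bonuslen[txg & TXG_MASK];

	if (next == 0)
		return;
	*dnp_bonuslen = (next == DN_ZERO_BONUSLEN) ? 0 : next;
	dn_next_bonuslen[txg & TXG_MASK] = 0;
}

int
main(void)
{
	uint16_t ondisk = 128;

	setbonuslen(7, 0);		/* a real shrink to zero... */
	sync_bonuslen(7, &ondisk);
	printf("%u\n", ondisk);		/* ...prints 0, not "no change" */
	return (0);
}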
*/ - dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -762,7 +785,7 @@ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; - int have_db0 = FALSE; + int err; if (size == 0) size = SPA_MINBLOCKSIZE; @@ -787,9 +810,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid == 0) { - have_db0 = TRUE; - } else if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -799,12 +820,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - db = NULL; - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { - /* obtain the old block */ - db = dbuf_hold(dn, 0, FTAG); + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) dbuf_new_size(db, size, tx); - } + else if (err != ENOENT) + goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -813,7 +834,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } - + /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); @@ -825,19 +846,32 @@ fail: return (ENOTSUP); } +/* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; - int drop_struct_lock = FALSE; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DB_BONUS_BLKID); - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - drop_struct_lock = TRUE; + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. 
+ */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } } if (blkid <= dn->dn_maxblkid) @@ -889,8 +923,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) } out: - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); } void @@ -951,15 +985,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; - int blksz, head; + int blksz, blkshift, head, tail; int trunc = FALSE; + int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - /* If the range is past the end of the file, this is a no-op */ - if (off >= blksz * (dn->dn_maxblkid+1)) - goto out; if (len == -1ULL) { len = UINT64_MAX - off; trunc = TRUE; @@ -971,11 +1005,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; don't do any head. */ - head = 0; + /* Freeing the whole block; fast-track this request */ + blkid = 0; + nblks = 1; + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + goto out; } else { /* Freeing part of the block. */ head = blksz - off; @@ -1008,88 +1049,95 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } /* If the range was less than one block, we're done */ - if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) + if (len == 0) goto out; - if (!ISP2(blksz)) { - /* - * They are freeing the whole block of a - * non-power-of-two blocksize file. Skip all the messy - * math. 
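The have_read path added to dnode_new_blkid() above is the classic try-upgrade idiom, and the function's new header comment ("read-holding callers must not rely on the lock being continuously held") is the caller-visible cost. Condensed, with the key point annotated (this is a restatement of the hunk, not new code):

	if (have_read) {
		if (blkid <= dn->dn_maxblkid)
			return;		/* fast path under the read lock */
		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
			/* contended: the lock is dropped entirely here, */
			rw_exit(&dn->dn_struct_rwlock);
			/* so another thread may grow dn_maxblkid first */
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		}
	}
	if (blkid <= dn->dn_maxblkid)	/* hence the repeated check */
		goto out;

On the way out, rw_downgrade() restores the reader state the caller started with.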
- */ - ASSERT3U(off, ==, 0); - ASSERT3U(len, >=, blksz); - blkid = 0; - nblks = 1; - } else { - int tail; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int blkshift = dn->dn_datablkshift; - - /* If the remaining range is past end of file, we're done */ - if (off > dn->dn_maxblkid << blkshift) - goto out; + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; - if (off + len == UINT64_MAX) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT3U(P2PHASE(off, blksz), ==, 0); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && - !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, - RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); } - len -= tail; + dbuf_rele(db, FTAG); } - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; + len -= tail; + } - /* dirty the left indirects */ - if (dn->dn_nlevels > 1 && off != 0) { - db = dbuf_hold_level(dn, 1, - (off - head) >> (blkshift + epbs), FTAG); + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Read in and mark all the level-1 indirects dirty, + * so that they will stay in memory until syncing phase. + * Always dirty the first and last indirect to make sure + * we dirty all the partial indirects. + */ + if (dn->dn_nlevels > 1) { + uint64_t i, first, last; + int shift = epbs + dn->dn_datablkshift; + + first = blkid >> epbs; + if (db = dbuf_hold_level(dn, 1, first, FTAG)) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* dirty the right indirects */ - if (dn->dn_nlevels > 1 && !trunc) { - db = dbuf_hold_level(dn, 1, - (off + len + tail - 1) >> (blkshift + epbs), FTAG); + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* - * Finally, add this range to the dnode range list, we - * will finish up this free operation in the syncing phase. - */ - ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); - ASSERT(off + len == UINT64_MAX || - IS_P2ALIGNED(len, 1<<blkshift)); - blkid = off >> blkshift; - nblks = len >> blkshift; - - if (trunc) - dn->dn_maxblkid = (blkid ? 
blkid - 1 : 0); + for (i = first + 1; i < last; i++) { + uint64_t ibyte = i << shift; + int err; + + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i >= last) + break; + ASSERT(err == 0); + db = dbuf_hold_level(dn, 1, i, FTAG); + if (db) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + } } - +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, blkid, nblks, tx); { @@ -1109,9 +1157,12 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } mutex_exit(&dn->dn_mtx); - dbuf_free_range(dn, blkid, nblks, tx); + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: + if (trunc && dn->dn_maxblkid >= (off >> blkshift)) + dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); + rw_exit(&dn->dn_struct_rwlock); } @@ -1179,7 +1230,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; - if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) { + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); dn->dn_phys->dn_used = space >> DEV_BSHIFT; @@ -1211,7 +1262,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) } static int -dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; @@ -1219,11 +1270,16 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; - int i, error, span; + boolean_t hole; + int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + hole = flags & DNODE_FIND_HOLE; + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; @@ -1232,9 +1288,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); if (error) { - if (error == ENOENT) - return (hole ? 0 : ESRCH); - return (error); + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and it's + * going to be unallocated, so we will skip over it. + */ + return (ESRCH); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { @@ -1246,13 +1311,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, if (db && txg && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + /* + * This can only happen when we are searching up the tree + * and these conditions mean that we need to keep climbing.
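dnode_next_offset()'s former boolean hole argument has become a flag word: DNODE_FIND_HOLE selects hole-versus-data, DNODE_FIND_BACKWARDS walks toward lower offsets, and DNODE_FIND_HAVELOCK tells it the caller already holds dn_struct_rwlock, as dmu_tx_count_free() and the level-1 walk above do. A sketch of the simple forward case, modeled loosely on dmu_offset_next() (illustrative, not a verbatim copy):

/*
 * Find the next hole or data region at or after *off.  minlvl 1 and
 * blkfill 1 scan level-0 data blocks; txg 0 disables the birth-time
 * filter.
 */
static int
offset_next(dnode_t *dn, boolean_t hole, uint64_t *off)
{
	int flags = hole ? DNODE_FIND_HOLE : 0;

	return (dnode_next_offset(dn, flags, off, 1, 1, 0));
}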
+ */ error = ESRCH; } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); - for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + for (i = (*offset >> span) & (blkfill - 1); + i >= 0 && i < blkfill; i += inc) { boolean_t newcontents = B_TRUE; if (txg) { int j; @@ -1264,9 +1334,9 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, } if (!dnp[i].dn_type == hole && newcontents) break; - *offset += 1ULL << span; + *offset += (1ULL << span) * inc; } - if (i == blkfill) + if (i < 0 || i == blkfill) error = ESRCH; } else { blkptr_t *bp = data; @@ -1280,14 +1350,17 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, minfill++; for (i = (*offset >> span) & ((1ULL << epbs) - 1); - i < epb; i++) { + i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && - bp[i].blk_birth > txg) + (hole || bp[i].blk_birth > txg)) break; - *offset += 1ULL << span; + if (inc < 0 && *offset < (1ULL << span)) + *offset = 0; + else + *offset += (1ULL << span) * inc; } - if (i >= epb) + if (i < 0 || i == epb) error = ESRCH; } @@ -1306,64 +1379,66 @@ * * Examples: * - * dnode_next_offset(dn, hole, offset, 1, 1, 0); - * Finds the next hole/data in a file. + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * - * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode in an objset's meta-dnode. * Only finds objects that have new contents since txg (i.e. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * - * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int -dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { + uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; - uint64_t initial_offset = *offset; - rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - rw_exit(&dn->dn_struct_rwlock); - return (ESRCH); + error = ESRCH; + goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { - if (hole) + if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = ESRCH; } - rw_exit(&dn->dn_struct_rwlock); - return (error); + goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } - while (--lvl >= minlvl && error == 0) { + while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); } - rw_exit(&dn->dn_struct_rwlock); - - if (error == 0 && initial_offset > *offset) + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset)) error = ESRCH; +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 9e8c7adbda01..a46d4e70abc8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,9 +55,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", - dn->dn_objset, dn->dn_object, - dn->dn_phys->dn_nlevels); + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); /* check for existing blkptrs in the dnode */ for (i = 0; i < nblkptr; i++) @@ -110,25 +109,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } -static void +static int free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i; + int i, blocks_freed = 0; - dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; - bytesfreed += bp_get_dasize(os->os_spa, bp); + bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); bzero(bp, sizeof (blkptr_t)); + blocks_freed += 1; } dnode_diduse_space(dn, -bytesfreed); + return (blocks_freed); } #ifdef ZFS_DEBUG @@ -160,7 +160,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(db->db_dnode, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); rw_exit(&db->db_dnode->dn_struct_rwlock); if (err == ENOENT) continue; @@ -178,7 +178,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -195,7 +195,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -206,6 +206,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +#define ALL -1 + static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) @@ -216,8 +218,18 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, uint64_t start, end, dbstart, dbend, i; int epbs, shift, err; int all = TRUE; + int blocks_freed = 0; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if we didn't get a dirty hold (because this block had just + * finished being written -- and so had no holds), and then this + * block 
got evicted before we got here. + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -241,10 +253,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } for (i = start; i <= end; i++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(subdb, blkid, nblks, trunc, tx)) { ASSERT3P(subdb->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(subdb->db_blkptr, ==, bp); + blocks_freed += free_blocks(dn, bp, 1, tx); } else { all = FALSE; } @@ -274,8 +286,8 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } /* @@ -305,15 +317,14 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); + (void) free_blocks(dn, bp + blkid, nblks, tx); if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, - 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } return; } @@ -331,9 +342,9 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(db, blkid, nblks, trunc, tx)) { ASSERT3P(db->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + if (free_children(db, blkid, nblks, trunc, tx) == ALL) { + ASSERT3P(db->db_blkptr, ==, bp); + (void) free_blocks(dn, bp, 1, tx); } dbuf_rele(db, FTAG); } @@ -343,15 +354,15 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } /* * Try to kick all the dnode's dbufs out of the cache...
*/ -int -dnode_evict_dbufs(dnode_t *dn, int try) +void +dnode_evict_dbufs(dnode_t *dn) { int progress; int pass = 0; @@ -367,6 +378,7 @@ dnode_evict_dbufs(dnode_t *dn, int try) for (; db != &marker; db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); + ASSERT3P(db->db_dnode, ==, dn); mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { @@ -375,7 +387,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) mutex_exit(&db->db_mtx); } else if (refcount_is_zero(&db->db_holds)) { progress = TRUE; - ASSERT(!arc_released(db->db_buf)); dbuf_clear(db); /* exits db_mtx for us */ } else { mutex_exit(&db->db_mtx); } @@ -397,21 +408,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) ASSERT(pass < 100); /* sanity check */ } while (progress); - /* - * This function works fine even if it can't evict everything. - * If were only asked to try to evict everything then - * return an error if we can't. Otherwise panic as the caller - * expects total eviction. - */ - if (list_head(&dn->dn_dbufs) != NULL) { - if (try) { - return (1); - } else { - panic("dangling dbufs (dn=%p, dbuf=%p)\n", - dn, list_head(&dn->dn_dbufs)); - } - } - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); @@ -419,7 +415,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); - return (0); } static void @@ -460,8 +455,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -479,10 +481,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; - /* free up all the blocks in the file.
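The eviction loop above leans on a common kernel list trick: insert an on-stack marker dbuf at the tail, then pop entries off the head and rotate survivors behind the marker, so every element is visited exactly once even while the list is mutated mid-sweep. A small userland model of the same sweep:

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct node {
	TAILQ_ENTRY(node) link;
	int evictable;
};
TAILQ_HEAD(nlist, node);

/* Visit every node exactly once, freeing the evictable ones. */
static void
sweep(struct nlist *list)
{
	struct node marker = { .evictable = 0 };
	struct node *n;

	TAILQ_INSERT_TAIL(list, &marker, link);
	while ((n = TAILQ_FIRST(list)) != &marker) {
		TAILQ_REMOVE(list, n, link);
		if (n->evictable)
			free(n);	/* the dbuf_clear() analogue */
		else
			TAILQ_INSERT_TAIL(list, n, link); /* keep it */
	}
	TAILQ_REMOVE(list, &marker, link);
}

int
main(void)
{
	struct nlist list = TAILQ_HEAD_INITIALIZER(list);
	struct node *n;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof (*n));
		n->evictable = i & 1;
		TAILQ_INSERT_TAIL(&list, n, link);
	}
	sweep(&list);
	TAILQ_FOREACH(n, &list, link)
		printf("kept a node\n");	/* two survivors */
	return (0);
}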
*/ - dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -496,6 +494,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -558,7 +557,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(list) != NULL || + dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -566,6 +565,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_blksz[txgoff] = 0; } + if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + dn->dn_next_bonuslen[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -583,20 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); /* process all the "freed" ranges in the file */ - if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { - for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; - rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) - dnode_sync_free_range(dn, - rp->fr_blkid, rp->fr_nblks, tx); + while (rp = avl_last(&dn->dn_ranges[txgoff])) { + dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); + /* grab the mutex so we don't race with dnode_block_freed() */ + mutex_enter(&dn->dn_mtx); + avl_remove(&dn->dn_ranges[txgoff], rp); + mutex_exit(&dn->dn_mtx); + kmem_free(rp, sizeof (free_range_t)); } - mutex_enter(&dn->dn_mtx); - for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { - free_range_t *last = rp; - rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); - avl_remove(&dn->dn_ranges[txgoff], last); - kmem_free(last, sizeof (free_range_t)); - } - mutex_exit(&dn->dn_mtx); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { dnode_sync_free(dn, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 7d4689f3352a..20d8ec85cc91 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu_objset.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -38,35 +36,44 @@ #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> +#include <sys/spa.h> +#include <sys/zfs_znode.h> +#include <sys/sunddi.h> + +static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_checkfunc_t dsl_dataset_rollback_check; static dsl_syncfunc_t dsl_dataset_rollback_sync; -static dsl_checkfunc_t dsl_dataset_destroy_check; -static dsl_syncfunc_t dsl_dataset_destroy_sync; +static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE +#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) + + /* - * We use weighted reference counts to express the various forms of exclusion - * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open - * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. - * This makes the exclusion logic simple: the total refcnt for all opens cannot - * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their - * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume - * just over half of the refcnt space, so there can't be more than one, but it - * can peacefully coexist with any number of STANDARD opens. + * Figure out how much of this delta should be propagated to the dsl_dir + * layer. If there's a refreservation, that space has already been + * partially accounted for in our ancestors. */ -static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { - 0, /* DS_MODE_NONE - invalid */ - 1, /* DS_MODE_STANDARD - unlimited number */ - (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */ - DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ -}; +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + uint64_t old_bytes, new_bytes; + if (ds->ds_reserved == 0) + return (delta); + + old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) @@ -74,6 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; dprintf_bp(bp, "born, ds=%p\n", ds); @@ -89,23 +97,28 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) * dsl_dir.
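parent_delta() above is small but subtle: when a refreservation is set, unique space below the reserved floor is already charged to the ancestors, so only the portion of a change that moves MAX(unique_bytes, reserved) is propagated to the dsl_dir. A runnable restatement with worked numbers (values in GB for readability):

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int64_t
parent_delta(uint64_t unique, uint64_t reserved, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (reserved == 0)
		return (delta);
	old_bytes = MAX(unique, reserved);
	new_bytes = MAX(unique + delta, reserved);
	return ((int64_t)(new_bytes - old_bytes));
}

int
main(void)
{
	/* 10G refreservation, 4G unique: a 2G write is fully absorbed */
	printf("%lld\n", (long long)parent_delta(4, 10, 2));	/* 0 */
	/* an 8G write pokes 2G above the reserved floor */
	printf("%lld\n", (long long)parent_delta(4, 10, 8));	/* 2 */
	return (0);
}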
*/ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, used, compressed, uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); ds->ds_phys->ds_used_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - used, compressed, uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } -void +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx) { @@ -113,10 +126,11 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(pio != NULL); ASSERT(dmu_tx_is_syncing(tx)); /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) - return; + return (0); ASSERT(used > 0); if (ds == NULL) { @@ -125,51 +139,59 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, * Account for the meta-objset space in its placeholder * dataset. */ - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; + return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int err; + int64_t delta; dprintf_bp(bp, "freeing: %s", ""); - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? 
ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - /* XXX unique_bytes is not accurate for head datasets */ - /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ASSERT(ds->ds_phys->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += - used; - mutex_exit(&ds->ds_prev->ds_lock); - } + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_prev->ds_lock); + } + if (bp->blk_birth > ds->ds_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); @@ -180,6 +202,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); + + return (used); } uint64_t @@ -216,32 +240,38 @@ static void dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - /* open_refcount == DS_REF_MAX when deleting */ - ASSERT(ds->ds_open_refcount == 0 || - ds->ds_open_refcount == DS_REF_MAX); + ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_phys->ds_fsid_guid); + unique_remove(ds->ds_fsid_guid); if (ds->ds_user_ptr != NULL) ds->ds_user_evict_func(ds, ds->ds_user_ptr); if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_close(&ds->ds_deadlist); - dsl_dir_close(ds->ds_dir, ds); + if (ds->ds_dir) + dsl_dir_close(ds->ds_dir, ds); - if (list_link_active(&ds->ds_synced_link)) - list_remove(&dp->dp_synced_objsets, ds); + ASSERT(!list_link_active(&ds->ds_synced_link)); + if (mutex_owned(&ds->ds_lock)) + mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); + if (mutex_owned(&ds->ds_opening_lock)) + mutex_exit(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_opening_lock); + if (mutex_owned(&ds->ds_deadlist.bpl_lock)) + mutex_exit(&ds->ds_deadlist.bpl_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + 
cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -266,16 +296,54 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); + headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } -int -dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag, dsl_dataset_t **dsp) +static int +dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +static int +dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_remove(mos, snapobj, name, tx); + return (err); +} + +static int +dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; @@ -297,8 +365,11 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&ds->ds_rwlock, 0, 0, 0); + cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); @@ -312,42 +383,65 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, * just opened it. 
*/ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds, &ds->ds_prev); + err = dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds, &ds->ds_prev); } - } else { - if (snapname) { -#ifdef ZFS_DEBUG - dsl_dataset_phys_t *headphys; - dmu_buf_t *headdbuf; - err = dmu_bonus_hold(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &headdbuf); + + if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *origin; + + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, + FTAG, &origin); if (err == 0) { - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, - &foundobj); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele(headdbuf, FTAG); + ds->ds_origin_txg = + origin->ds_phys->ds_creation_txg; + dsl_dataset_rele(origin, FTAG); } -#endif - (void) strcat(ds->ds_snapname, snapname); - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); } + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + err = dsl_dataset_get_snapname(ds); + } + + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + /* + * In sync context, we're called with either no lock + * or with the write lock. If we're not syncing, + * we're always called with the read lock held. + */ + boolean_t need_lock = + !RW_WRITE_HELD(&dp->dp_config_rwlock) && + dsl_pool_sync_context(dp); + + if (need_lock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_ds(ds, + "refreservation", sizeof (uint64_t), 1, + &ds->ds_reserved, NULL); + if (err == 0) { + err = dsl_prop_get_ds(ds, + "refquota", sizeof (uint64_t), 1, + &ds->ds_quota, NULL); + } + + if (need_lock) + rw_exit(&dp->dp_config_rwlock); + } else { + ds->ds_reserved = ds->ds_quota = 0; } if (err == 0) { @@ -356,13 +450,14 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } if (err || winner) { bplist_close(&ds->ds_deadlist); - if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, - DS_MODE_NONE, ds); - } + if (ds->ds_prev) + dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -370,101 +465,175 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } ds = winner; } else { - uint64_t new = + ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); - if (new != ds->ds_phys->ds_fsid_guid) { - /* XXX it won't necessarily be synced... 
*/ - ds->ds_phys->ds_fsid_guid = new; - } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); mutex_enter(&ds->ds_lock); - if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && - (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) && - !DS_MODE_IS_INCONSISTENT(mode)) || - (ds->ds_open_refcount + weight > DS_REF_MAX)) { + if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); - dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (EBUSY); + dmu_buf_rele(ds->ds_dbuf, tag); + return (ENOENT); } - ds->ds_open_refcount += weight; mutex_exit(&ds->ds_lock); - *dsp = ds; return (0); } +static int +dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* + * In syncing context we don't want the rwlock lock: there + * may be an existing writer waiting for sync phase to + * finish. We don't need to worry about such writers, since + * sync phase is single-threaded, so the writer can't be + * doing anything while we are active. + */ + if (dsl_pool_sync_context(dp)) { + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + return (0); + } + + /* + * Normal users will hold the ds_rwlock as a READER until they + * are finished (i.e., call dsl_dataset_rele()). "Owners" will + * drop their READER lock after they set the ds_owner field. + * + * If the dataset is being destroyed, the destroy thread will + * obtain a WRITER lock for exclusive access after it's done its + * open-context work and then change the ds_owner to + * dsl_reaper once destruction is assured. So threads + * may block here temporarily, until the "destructability" of + * the dataset is determined. 
+ */ + ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); + mutex_enter(&ds->ds_lock); + while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { + rw_exit(&dp->dp_config_rwlock); + cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); + if (DSL_DATASET_IS_DESTROYED(ds)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_drop_ref(ds, tag); + rw_enter(&dp->dp_config_rwlock, RW_READER); + return (ENOENT); + } + rw_enter(&dp->dp_config_rwlock, RW_READER); + } + mutex_exit(&ds->ds_lock); + return (0); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); + + if (err) + return (err); + return (dsl_dataset_hold_ref(*dsp, tag)); +} + int -dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp) +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); + + ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); + + if (err) + return (err); + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; dsl_pool_t *dp; - const char *tail; + const char *snapname; uint64_t obj; - dsl_dataset_t *ds = NULL; int err = 0; - err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); if (err) return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj == 0) { - /* A dataset with no associated objset */ + if (obj) + err = dsl_dataset_get_ref(dp, obj, tag, dsp); + else err = ENOENT; + if (err) goto out; - } - if (tail != NULL) { - objset_t *mos = dp->dp_meta_objset; + err = dsl_dataset_hold_ref(*dsp, tag); - err = dsl_dataset_open_obj(dp, obj, NULL, - DS_MODE_NONE, tag, &ds); - if (err) - goto out; - obj = ds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(ds, DS_MODE_NONE, tag); - ds = NULL; + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *ds = NULL; - if (tail[0] != '@') { + if (*snapname++ != '@') { + dsl_dataset_rele(*dsp, tag); err = ENOENT; goto out; } - tail++; - /* Look for a snapshot */ - if (!DS_MODE_IS_READONLY(mode)) { - err = EROFS; - goto out; + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + if (err == 0) + err = dsl_dataset_get_ref(dp, obj, tag, &ds); + dsl_dataset_rele(*dsp, tag); + + ASSERT3U((err == 0), ==, (ds != NULL)); + + if (ds) { + mutex_enter(&ds->ds_lock); + if (ds->ds_snapname[0] == 0) + (void) strlcpy(ds->ds_snapname, snapname, + sizeof (ds->ds_snapname)); + mutex_exit(&ds->ds_lock); + err = dsl_dataset_hold_ref(ds, tag); + *dsp = err ? 
NULL : ds; } - dprintf("looking for snapshot '%s'\n", tail); - err = zap_lookup(mos, obj, tail, 8, 1, &obj); - if (err) - goto out; } - err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); - out: rw_exit(&dp->dp_config_rwlock); dsl_dir_close(dd, FTAG); - - ASSERT3U((err == 0), ==, (ds != NULL)); - /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ - - *dsp = ds; return (err); } int -dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) { - return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); + int err = dsl_dataset_hold(name, owner, dsp); + if (err) + return (err); + if ((*dsp)->ds_phys->ds_num_children > 0 && + !DS_MODE_IS_READONLY(flags)) { + dsl_dataset_rele(*dsp, owner); + return (EROFS); + } + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); } void @@ -477,11 +646,11 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. - */ mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -505,7 +674,6 @@ dsl_dataset_namelen(dsl_dataset_t *ds) if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* see dsl_datset_name */ mutex_enter(&ds->ds_lock); result += strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -519,119 +687,160 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) +dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { + rw_exit(&ds->ds_rwlock); + } + dsl_dataset_drop_ref(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +{ + ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_open_refcount, >=, weight); - ds->ds_open_refcount -= weight; - dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", - mode, ds->ds_open_refcount); + ds->ds_owner = NULL; + if (RW_WRITE_HELD(&ds->ds_rwlock)) { + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + } mutex_exit(&ds->ds_lock); + if (ds->ds_dbuf) + dsl_dataset_drop_ref(ds, owner); + else + dsl_dataset_evict(ds->ds_dbuf, ds); +} - dmu_buf_rele(ds->ds_dbuf, tag); +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +{ + boolean_t gotit = FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && + (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + ds->ds_owner = owner; + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) + rw_exit(&ds->ds_rwlock); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); } void -dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) +dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) { - objset_t *mos = dp->dp_meta_objset; + ASSERT3P(owner, ==, ds->ds_owner); + if (!RW_WRITE_HELD(&ds->ds_rwlock)) + rw_enter(&ds->ds_rwlock, RW_WRITER); +} 
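The ownership protocol introduced above comes down to three rules: ordinary holders keep ds_rwlock as READER until dsl_dataset_rele(), an owner records itself in ds_owner and then drops its READER hold (dsl_dataset_tryown()), and the destroy path takes the lock as WRITER and wakes waiters through ds_exclusive_cv. The following user-space sketch models that handshake with POSIX threads; every name prefixed with model_ is invented for illustration, and none of this is ZFS code.

    /*
     * User-space model of the dsl_dataset ownership handshake above.
     * All model_* names are hypothetical; build with -lpthread.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct model_ds {
            pthread_mutex_t lock;           /* plays the role of ds_lock */
            pthread_rwlock_t rwlock;        /* plays the role of ds_rwlock */
            pthread_cond_t exclusive_cv;    /* plays the role of ds_exclusive_cv */
            void *owner;                    /* plays the role of ds_owner */
            bool inconsistent;              /* DS_IS_INCONSISTENT() stand-in */
    } model_ds_t;

    /* A plain hold: take the rwlock as READER and keep it until release. */
    static void
    model_hold(model_ds_t *ds)
    {
            (void) pthread_rwlock_rdlock(&ds->rwlock);
    }

    static void
    model_rele(model_ds_t *ds)
    {
            (void) pthread_rwlock_unlock(&ds->rwlock);
    }

    /*
     * Try to become the owner: record ourselves in ds->owner and drop the
     * READER hold, as dsl_dataset_tryown() drops ds_rwlock after setting
     * ds_owner.  Never blocks; fails if someone else already owns it.
     */
    static bool
    model_tryown(model_ds_t *ds, bool inconsistentok, void *owner)
    {
            bool gotit = false;

            (void) pthread_mutex_lock(&ds->lock);
            if (ds->owner == NULL && (!ds->inconsistent || inconsistentok)) {
                    ds->owner = owner;
                    (void) pthread_rwlock_unlock(&ds->rwlock);
                    gotit = true;
            }
            (void) pthread_mutex_unlock(&ds->lock);
            return (gotit);
    }

    /* Give up ownership and wake anyone parked on the condition variable. */
    static void
    model_disown(model_ds_t *ds, void *owner)
    {
            (void) pthread_mutex_lock(&ds->lock);
            if (ds->owner == owner)
                    ds->owner = NULL;
            (void) pthread_cond_broadcast(&ds->exclusive_cv);
            (void) pthread_mutex_unlock(&ds->lock);
    }

    int
    main(void)
    {
            model_ds_t ds;
            int me;

            (void) pthread_mutex_init(&ds.lock, NULL);
            (void) pthread_rwlock_init(&ds.rwlock, NULL);
            (void) pthread_cond_init(&ds.exclusive_cv, NULL);
            ds.owner = NULL;
            ds.inconsistent = false;

            model_hold(&ds);                        /* like dsl_dataset_hold() */
            if (model_tryown(&ds, false, &me)) {    /* like dsl_dataset_own() */
                    printf("owned the dataset\n");
                    model_disown(&ds, &me);         /* like dsl_dataset_disown() */
            } else {
                    model_rele(&ds);                /* the EBUSY path */
            }
            return (0);
    }

The point the model preserves is that ownership is non-blocking and advisory: a failed model_tryown() leaves the caller holding only its reader reference, just as dsl_dataset_own() falls back to dsl_dataset_rele() and returns EBUSY.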
+ +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - dsl_dataset_t *ds; uint64_t dsobj; - dsl_dir_t *dd; + objset_t *mos = dp->dp_meta_objset; - dsl_dir_create_root(mos, ddobjp, tx); - VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + + if (origin) { + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + origin->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + origin->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + origin->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + origin->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = origin->ds_phys->ds_bp; + dsphys->ds_flags |= origin->ds_phys->ds_flags; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + origin->ds_phys->ds_num_children++; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (origin->ds_phys->ds_next_clones_obj == 0) { + origin->ds_phys->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_origin_obj = origin->ds_object; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; - dsl_dir_close(dd, FTAG); - VERIFY(0 == - dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); - (void) dmu_objset_create_impl(dp->dp_spa, ds, - &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (dsobj); } uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, - const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) +dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; uint64_t dsobj, ddobj; - objset_t *mos = dp->dp_meta_objset; dsl_dir_t *dd; - ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp); - ASSERT(clone_parent == NULL || - clone_parent->ds_phys->ds_num_children > 0); ASSERT(lastname[0] != '@'); - 
ASSERT(dmu_tx_is_syncing(tx)); - ddobj = dsl_dir_create_sync(pdd, lastname, tx); + ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (clone_parent) { - dsphys->ds_prev_snap_obj = clone_parent->ds_object; - dsphys->ds_prev_snap_txg = - clone_parent->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - clone_parent->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = - clone_parent->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - clone_parent->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = clone_parent->ds_phys->ds_bp; + dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); - dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); - clone_parent->ds_phys->ds_num_children++; + dsl_deleg_set_create_perms(dd, tx, cr); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; - } - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; dsl_dir_close(dd, FTAG); return (dsobj); @@ -653,21 +862,24 @@ dsl_snapshot_destroy_one(char *name, void *arg) (void) strcat(name, "@"); (void) strcat(name, da->snapname); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, da->dstg, &ds); cp = strchr(name, '@'); *cp = '\0'; - if (err == ENOENT) - return (0); - if (err) { + if (err == 0) { + dsl_dataset_make_exclusive(ds, da->dstg); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, da->dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { (void) strcpy(da->failed, name); - return (err); } - - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); - return (0); + return (err); } /* @@ -681,16 +893,8 @@ dsl_snapshots_destroy(char *fsname, char *snapname) struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; - char *cp; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); @@ -706,17 +910,14 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; + /* + * Return the file system name that triggered the error + */ if (dst->dst_err) { dsl_dataset_name(ds, fsname); - cp = strchr(fsname, '@'); - *cp = '\0'; + *strchr(fsname, '@') = '\0'; } - /* - * If it was successful, 
destroy_sync would have - * closed the ds - */ - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg); + dsl_dataset_disown(ds, da.dstg); } dsl_sync_task_group_destroy(da.dstg); @@ -724,36 +925,33 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +/* + * ds must be opened as OWNER. On return (whether successful or not), + * ds will be closed and caller can no longer dereference it. + */ int -dsl_dataset_destroy(const char *name) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) { int err; dsl_sync_task_group_t *dstg; objset_t *os; - dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t obj; - if (strchr(name, '@')) { + if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err) - return (err); + dsl_dataset_make_exclusive(ds, tag); + + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, FTAG, 0); - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + ds, tag, 0); + goto out; } - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err) - return (err); - ds = os->os->os_dsl_dataset; dd = ds->ds_dir; /* @@ -762,10 +960,12 @@ dsl_dataset_destroy(const char *name) */ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) { - dmu_objset_close(os); - return (err); - } + if (err) + goto out; + + err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (err) + goto out; /* * remove the objects in open context, so that we won't @@ -773,66 +973,73 @@ dsl_dataset_destroy(const char *name) */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - /* - * Perhaps there is not enough disk - * space. Just deal with it from - * dsl_dataset_destroy_sync(). - */ - dmu_tx_abort(tx); - continue; - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - dmu_tx_commit(tx); + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); } - /* Make sure it's not dirty before we finish destroying it. */ - txg_wait_synced(dd->dd_pool, 0); dmu_objset_close(os); if (err != ESRCH) - return (err); + goto out; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); + rw_exit(&dd->dd_pool->dp_config_rwlock); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); if (err) - return (err); + goto out; - err = dsl_dir_open(name, FTAG, &dd, NULL); - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + if (ds->ds_user_ptr) { + /* + * We need to sync out all in-flight IO before we try + * to evict (the dataset evict func is trying to clear + * the cached entries for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); } /* * Blow away the dsl_dir + head dataset. 
*/ + dsl_dataset_make_exclusive(ds, tag); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, FTAG, 0); + dsl_dataset_destroy_sync, ds, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, dsl_dir_destroy_sync, dd, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); - /* if it is successful, *destroy_sync will close the ds+dd */ - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ + if (err) dsl_dir_close(dd, FTAG); - } +out: + dsl_dataset_disown(ds, tag); return (err); } int -dsl_dataset_rollback(dsl_dataset_t *ds) +dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) { - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); - return (dsl_sync_task_do(ds->ds_dir->dd_pool, + int err; + + ASSERT(ds->ds_owner); + + dsl_dataset_make_exclusive(ds, ds->ds_owner); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, NULL, 0)); + ds, &ost, 0); + /* drop exclusive access */ + mutex_enter(&ds->ds_lock); + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + return (err); } void * @@ -904,14 +1111,56 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) } } +/* + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. + */ +static void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + else + mrs_used = 0; + + VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, + &dluncomp)); + + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + + if (!DS_UNIQUE_IS_ACCURATE(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +static uint64_t +dsl_dataset_unique(dsl_dataset_t *ds) +{ + if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) + dsl_dataset_recalc_head_uniq(ds); + + return (ds->ds_phys->ds_unique_bytes); +} + struct killarg { - uint64_t *usedp; - uint64_t *compressedp; - uint64_t *uncompressedp; + dsl_dataset_t *ds; zio_t *zio; dmu_tx_t *tx; }; +/* ARGSUSED */ static int kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) { @@ -920,16 +1169,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) ASSERT3U(bc->bc_errno, ==, 0); - /* - * Since this callback is not called concurrently, no lock is - * needed on the accounting values. - */ - *ka->usedp += bp_get_dasize(spa, bp); - *ka->compressedp += BP_GET_PSIZE(bp); - *ka->uncompressedp += BP_GET_UCSIZE(bp); - /* XXX check for EIO? 
*/ - (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, - ARC_NOWAIT); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + return (0); } @@ -938,14 +1180,12 @@ static int dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; /* - * There must be a previous snapshot. I suppose we could roll - * it back to being empty (and re-initialize the upper (ZPL) - * layer). But for now there's no way to do this via the user - * interface. + * We can only roll back to emptiness if it is a ZPL objset. */ - if (ds->ds_phys->ds_prev_snap_txg == 0) + if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) return (EINVAL); /* @@ -966,13 +1206,44 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Before the rollback, destroy the ZIL. + */ + if (ds->ds_user_ptr != NULL) { + zil_rollback_destroy( + ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); + + /* + * We need to make sure that the objset_impl_t is reopened after + * we do the rollback, otherwise it will have the wrong + * objset_phys_t. Normally this would happen when this + * dataset-open is closed, thus causing the + * dataset to be immediately evicted. But when doing "zfs recv + * -F", we reopen the objset before that, so that there is no + * window where the dataset is closed and inconsistent. + */ + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + + /* Transfer space that was freed since last snap back to the head. */ + { + uint64_t used; + + VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, + ds->ds_origin_txg, UINT64_MAX, &used)); + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_SNAP, DD_USED_HEAD, tx); + } + /* Zero out the deadlist. 
*/ bplist_close(&ds->ds_deadlist); bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); @@ -984,39 +1255,65 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) { /* Free blkptrs that we gave birth to */ zio_t *zio; - uint64_t used = 0, compressed = 0, uncompressed = 0; struct killarg ka; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); (void) zio_wait(zio); - - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); } - /* Change our contents to that of the prev snapshot */ - ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - ds->ds_phys->ds_unique_bytes = 0; + ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || + ds->ds_phys->ds_unique_bytes == 0); + + if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { + /* Change our contents to that of the prev snapshot */ + + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT3U(ds->ds_phys->ds_used_bytes, <=, + ds->ds_prev->ds_phys->ds_used_bytes); + + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = + ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; + + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + } + } else { + objset_impl_t *osi; + + ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; + bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); + ds->ds_phys->ds_flags = 0; + ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, + &ds->ds_phys->ds_bp, *ost, tx); +#ifdef _KERNEL + zfs_create_fs(&osi->os, kcred, NULL, tx); +#endif } + + spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ @@ -1024,6 +1321,9 @@ static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; /* * Can't delete a head dataset if there are snapshots of it. 
@@ -1034,26 +1334,44 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EINVAL); + /* + * This is really a dsl_dir thing, but check it here so that + * we'll be less likely to leave this dataset inconsistent & + * nearly destroyed. + */ + err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + return (0); } /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dsl_pool_t *dp = ds->ds_dir->dd_pool; /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ -static int +int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + /* we have an owner hold, so no one else can destroy us */ + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + /* Can't delete a branch point. */ if (ds->ds_phys->ds_num_children > 1) return (EEXIST); @@ -1078,11 +1396,50 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } +struct refsarg { + kmutex_t lock; + boolean_t gone; + kcondvar_t cv; +}; + +/* ARGSUSED */ +static void +dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +{ + struct refsarg *arg = argv; + + mutex_enter(&arg->lock); + arg->gone = TRUE; + cv_signal(&arg->cv); + mutex_exit(&arg->lock); +} + static void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) +{ + struct refsarg arg; + + mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); + arg.gone = FALSE; + (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, + dsl_dataset_refs_gone); + dmu_buf_rele(ds->ds_dbuf, tag); + mutex_enter(&arg.lock); + while (!arg.gone) + cv_wait(&arg.cv, &arg.lock); + ASSERT(arg.gone); + mutex_exit(&arg.lock); + ds->ds_dbuf = NULL; + ds->ds_phys = NULL; + mutex_destroy(&arg.lock); + cv_destroy(&arg.cv); +} + +void +dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t used = 0, compressed = 0, uncompressed = 0; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1091,29 +1448,53 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_dataset_t *ds_prev = NULL; uint64_t obj; - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); + ASSERT(ds->ds_owner); ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + /* signal any waiters that this dataset is going away */ + mutex_enter(&ds->ds_lock); + ds->ds_owner = dsl_reaper; + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + uint64_t val = 0; + dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + ASSERT3U(ds->ds_reserved, ==, 0); + } + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + dsl_pool_ds_destroyed(ds, tx); + obj = ds->ds_object; if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - 
DS_MODE_NONE, FTAG, &ds_prev)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + VERIFY(0 == zap_remove_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(0 == zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (after_branch_point && ds->ds_phys->ds_next_snap_obj == 0) { /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); @@ -1130,14 +1511,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) blkptr_t bp; dsl_dataset_t *ds_next; uint64_t itor = 0; + uint64_t old_unique; + int64_t used = 0, compressed = 0, uncompressed = 0; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_next)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + old_unique = dsl_dataset_unique(ds_next); + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1154,8 +1536,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * * XXX we're doing this long task with the config lock held */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, - &bp) == 0) { + while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, &bp, tx)); @@ -1170,16 +1551,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) compressed += BP_GET_PSIZE(&bp); uncompressed += BP_GET_UCSIZE(&bp); /* XXX check return value? 
*/ - (void) arc_free(zio, dp->dp_spa, tx->tx_txg, + (void) dsl_free(zio, dp, tx->tx_txg, &bp, NULL, NULL, ARC_NOWAIT); } } + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -compressed, -uncompressed, tx); + /* free next's deadlist */ bplist_close(&ds_next->ds_deadlist); bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); /* set next's deadlist to our deadlist */ + bplist_close(&ds->ds_deadlist); ds_next->ds_phys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, @@ -1200,51 +1588,50 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * config lock held */ dsl_dataset_t *ds_after_next; + uint64_t space; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_after_next)); - itor = 0; - while (bplist_iterate(&ds_after_next->ds_deadlist, - &itor, &bp) == 0) { - if (bp.blk_birth > - ds->ds_phys->ds_prev_snap_txg && - bp.blk_birth <= - ds->ds_phys->ds_creation_txg) { - ds_next->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, + FTAG, &ds_after_next)); + + VERIFY(0 == + bplist_space_birthrange(&ds_after_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, &space)); + ds_next->ds_phys->ds_unique_bytes += space; - dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_after_next, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); } else { ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, - ds_next); + dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; if (ds_prev) { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds_next, &ds_next->ds_prev)); - } else { - ds_next->ds_prev = NULL; + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); } - } - dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); - /* - * NB: unique_bytes is not accurate for head objsets - * because we don't update it when we delete the most - * recent snapshot -- see above comment. - */ - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); } else { /* * There's no next snapshot, so this is a head dataset. @@ -1263,76 +1650,106 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * - * XXX we're doing this long task with the config lock held + * NB: this should be very quick, because we already + * freed all the objects in open context. 
*/ - ka.usedp = &used; + ka.ds = ds; ka.zio = zio; ka.tx = tx; err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); + ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ds->ds_phys->ds_unique_bytes == 0); } err = zio_wait(zio); ASSERT3U(err, ==, 0); - dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); - - if (ds->ds_phys->ds_snapnames_zapobj) { - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dataset */ + /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - /* - * dsl_dir_sync_destroy() called us, they'll destroy - * the dataset. - */ + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG, &ds_head)); + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY(0 == dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); - err = zap_lookup(mos, - ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &val); ASSERT3U(err, ==, 0); ASSERT3U(val, ==, obj); } #endif - err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ASSERT(err == 0); - dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_head, FTAG); } if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); - - spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag); + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT(0 == zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY(0 == dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + dsl_dir_close(ds->ds_dir, ds); + ds->ds_dir = NULL; + dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); +} +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + return (ENOSPC); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
+ */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); } /* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; uint64_t value; @@ -1346,8 +1763,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Check for conflicting name snapshot name. */ - err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &value); + err = dsl_dataset_snap_lookup(ds, snapname, &value); if (err == 0) return (EEXIST); if (err != ENOENT) @@ -1360,34 +1776,44 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); + err = dsl_dataset_snapshot_reserve_space(ds, tx); + if (err) + return (err); + ds->ds_trysnap_txg = tx->tx_txg; return (0); } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - uint64_t dsobj; + uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; int err; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1395,7 +1821,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; @@ -1406,6 +1832,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); if (ds->ds_prev) { + uint64_t next_clones_obj = + ds->ds_prev->ds_phys->ds_next_clones_obj; ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds->ds_prev->ds_phys->ds_num_children > 1); @@ -1414,15 +1842,33 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + VERIFY3U(0, ==, zap_remove_int(mos, + next_clones_obj, dsphys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(mos, + next_clones_obj, dsobj, tx)); } } + /* + * If we have a reference-reservation on this dataset, we will + * 
need to increase the amount of refreservation being charged + * since our unique space is going to zero. + */ + if (ds->ds_reserved) { + int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + add, 0, 0, tx); + } + bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; + ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, @@ -1434,10 +1880,14 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(err == 0); if (ds->ds_prev) - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, - DS_MODE_NONE, ds, &ds->ds_prev)); + dsl_dataset_drop_ref(ds->ds_prev, ds); + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_pool_ds_snapshotted(ds, tx); + + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + "dataset = %llu", dsobj); } void @@ -1447,22 +1897,38 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ASSERT(ds->ds_user_ptr != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + dsl_dir_dirty(ds->ds_dir, tx); dmu_objset_sync(ds->ds_user_ptr, zio, tx); - /* Unneeded? bplist_close(&ds->ds_deadlist); */ } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + uint64_t refd, avail, uobjs, aobjs; + dsl_dir_stats(ds->ds_dir, nv); + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - ds->ds_phys->ds_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + ds->ds_phys->ds_guid); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1483,29 +1949,29 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; } /* clone origin is really a dsl_dir thing... 
*/ - if (ds->ds_dir->dd_phys->dd_clone_parent_obj) { + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool, - ds->ds_dir->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ods)); - dsl_dataset_name(ods, stat->dds_clone_of); - dsl_dataset_close(ods, DS_MODE_NONE, FTAG); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_origin); + dsl_dataset_drop_ref(ods, FTAG); } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { - return (ds->ds_phys->ds_fsid_guid); + return (ds->ds_fsid_guid); } void @@ -1515,10 +1981,37 @@ dsl_dataset_space(dsl_dataset_t *ds, { *refdbytesp = ds->ds_phys->ds_used_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) + *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } +boolean_t +dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + if (ds->ds_prev == NULL) + return (B_FALSE); + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (B_TRUE); + return (B_FALSE); +} + /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1526,20 +2019,18 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; uint64_t val; int err; - err = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds); + err = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); if (err) return (err); /* new name better not be in use */ - err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj, - newsnapname, 8, 1, &val); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + err = dsl_dataset_snap_lookup(hds, newsnapname, &val); + dsl_dataset_rele(hds, FTAG); if (err == 0) err = EEXIST; @@ -1554,10 +2045,11 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; + const char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; @@ -1565,12 +2057,11 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); VERIFY(0 == 
dsl_dataset_get_snapname(ds)); - err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ASSERT3U(err, ==, 0); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, newsnapname); @@ -1579,10 +2070,12 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + dsl_dataset_rele(hds, FTAG); } -struct renamearg { +struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; @@ -1592,7 +2085,7 @@ struct renamearg { static int dsl_snapshot_rename_one(char *name, void *arg) { - struct renamearg *ra = arg; + struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *cp; int err; @@ -1600,25 +2093,33 @@ dsl_snapshot_rename_one(char *name, void *arg) cp = name + strlen(name); *cp = '@'; (void) strcpy(cp + 1, ra->oldsnap); - err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD, - ra->dstg, &ds); + + /* + * For recursive snapshot renames the parent won't be changing + * so we just pass name for both the to/from argument. + */ + err = zfs_secpolicy_rename_perms(name, name, CRED()); if (err == ENOENT) { - *cp = '\0'; return (0); - } - if (err) { + } else if (err) { (void) strcpy(ra->failed, name); - *cp = '\0'; - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); return (err); } #ifdef _KERNEL - /* for all filesystems undergoing rename, we'll need to unmount it */ + /* + * For all filesystems undergoing rename, we'll need to unmount it. + */ (void) zfs_unmount_snap(name, NULL); #endif - + err = dsl_dataset_hold(name, ra->dstg, &ds); *cp = '\0'; + if (err == ENOENT) { + return (0); + } else if (err) { + (void) strcpy(ra->failed, name); + return (err); + } dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -1630,7 +2131,7 @@ static int dsl_recursive_rename(char *oldname, const char *newname) { int err; - struct renamearg *ra; + struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); @@ -1640,19 +2141,12 @@ dsl_recursive_rename(char *oldname, const char *newname) cp = strchr(fsname, '@'); *cp = '\0'; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) { kmem_free(fsname, len + 1); return (err); } - ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP); + ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; @@ -1675,21 +2169,32 @@ dsl_recursive_rename(char *oldname, const char *newname) (void) strcat(ra->failed, "@"); (void) strcat(ra->failed, ra->newsnap); } - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); + dsl_dataset_rele(ds, ra->dstg); } - (void) strcpy(oldname, ra->failed); + if (err) + (void) strcpy(oldname, ra->failed); dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamearg)); + kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } +static int +dsl_valid_rename(char *oldname, void *arg) +{ + int delta = *(int *)arg; + + if (strlen(oldname) + delta >= MAXNAMELEN) + return (ENAMETOOLONG); + + return (0); +} + 
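dsl_valid_rename() above captures the arithmetic behind a rename that grows the prefix: if the new name is delta bytes longer than the old one, every descendant dataset name grows by exactly delta, so the rename must be refused when any child would overflow the name limit. Below is a minimal stand-alone sketch of that check; the dataset names are invented and the 256-byte limit is assumed here for illustration.

    /* Stand-alone illustration of the dsl_valid_rename() length check. */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define MODEL_MAXNAMELEN 256    /* assumed dataset name limit */

    /* Reject any child whose name would no longer fit after the rename. */
    static int
    model_valid_rename(const char *oldchild, int delta)
    {
            if (strlen(oldchild) + delta >= MODEL_MAXNAMELEN)
                    return (ENAMETOOLONG);
            return (0);
    }

    int
    main(void)
    {
            const char *oldfs = "tank/a";
            const char *newfs = "tank/a-much-longer-name";
            int delta = (int)strlen(newfs) - (int)strlen(oldfs);
            const char *children[] = { "tank/a/home", "tank/a/home/user@snap" };
            size_t i;

            for (i = 0; i < sizeof (children) / sizeof (children[0]); i++)
                    printf("%s: %s\n", children[i],
                        model_valid_rename(children[i], delta) ?
                        "ENAMETOOLONG" : "ok");
            return (0);
    }

Note that dsl_dataset_rename() below only performs this validation walk when delta > 0; a same-length or shrinking rename cannot push a child past the limit, so the dmu_objset_find() pass is skipped.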
#pragma weak dmu_objset_rename = dsl_dataset_rename int -dsl_dataset_rename(char *oldname, const char *newname, - boolean_t recursive) +dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) { dsl_dir_t *dd; dsl_dataset_t *ds; @@ -1700,7 +2205,15 @@ dsl_dataset_rename(char *oldname, const char *newname, if (err) return (err); if (tail == NULL) { - err = dsl_dir_rename(dd, newname); + int delta = strlen(newname) - strlen(oldname); + + /* if we're growing, validate child name lengths */ + if (delta > 0) + err = dmu_objset_find(oldname, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + if (!err) + err = dsl_dir_rename(dd, newname); dsl_dir_close(dd, FTAG); return (err); } @@ -1723,8 +2236,7 @@ dsl_dataset_rename(char *oldname, const char *newname, if (recursive) { err = dsl_recursive_rename(oldname, newname); } else { - err = dsl_dataset_open(oldname, - DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds); + err = dsl_dataset_hold(oldname, FTAG, &ds); if (err) return (err); @@ -1732,278 +2244,640 @@ dsl_dataset_rename(char *oldname, const char *newname, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); - dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + dsl_dataset_rele(ds, FTAG); } return (err); } +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; +}; + struct promotearg { - uint64_t used, comp, uncomp, unique; - uint64_t newnext_obj, snapnames_obj; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin, *origin_head; + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; }; +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); + +/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds = NULL; - dsl_dataset_t *pivot_ds = NULL; - dsl_dataset_t *newnext_ds = NULL; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; int err; - char *name = NULL; - uint64_t itor = 0; - blkptr_t bp; - - bzero(pa, sizeof (*pa)); - /* Check that it is a clone */ - if (dd->dd_phys->dd_clone_parent_obj == 0) + /* Check that it is a real clone */ + if (!dsl_dir_is_clone(hds->ds_dir)) return (EINVAL); /* Since this is so expensive, don't do the preliminary check */ if (!dmu_tx_is_syncing(tx)) return (0); - if (err = dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)) - goto out; - pdd = pivot_ds->ds_dir; - - { - dsl_dataset_t *phds; - if (err = dsl_dataset_open_obj(dd->dd_pool, - pdd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_NONE, FTAG, &phds)) - goto out; - pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(phds, DS_MODE_NONE, FTAG); - } - - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { - err = EXDEV; - goto out; - } - - /* find pivot point's new next ds */ - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object, - NULL, DS_MODE_NONE, FTAG, &newnext_ds)); - while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) { - dsl_dataset_t *prev; - - if (err = dsl_dataset_open_obj(dd->dd_pool, - newnext_ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, FTAG, &prev)) - goto out; - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - newnext_ds = prev; - } - pa->newnext_obj = newnext_ds->ds_object; + if 
(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + return (EXDEV); - /* compute pivot point's new unique space */ - while ((err = bplist_iterate(&newnext_ds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg) - pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp); - } - if (err != ENOENT) - goto out; + /* compute origin's new unique space */ + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + err = bplist_space_birthrange(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); + if (err) + return (err); - /* Walk the snapshots that we are moving */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { + /* + * Walk the snapshots that we are moving + * + * Compute space to transfer. Consider the incremental changes + * to used for each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) + * So a sequence would look like: + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) + * Which simplifies to: + * uN + kN + kN-1 + ... + k1 + k0 + * Note however, if we stop before we reach the ORIGIN we get: + * uN + kN + kN-1 + ... + kM - uM-1 + */ + pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->comp = origin_ds->ds_phys->ds_compressed_bytes; + pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *prev; + dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ - dsl_dataset_name(ds, name); - err = zap_lookup(dd->dd_pool->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &val); - if (err != ENOENT) { - if (err == 0) - err = EEXIST; - goto out; - } - - /* - * compute space to transfer. Each snapshot gave birth to: - * (my used) - (prev's used) + (deadlist's used) - */ - pa->used += ds->ds_phys->ds_used_bytes; - pa->comp += ds->ds_phys->ds_compressed_bytes; - pa->uncomp += ds->ds_phys->ds_uncompressed_bytes; + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); - /* If we reach the first snapshot, we're done. */ + /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + continue; if (err = bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp)) - goto out; - if (err = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)) - goto out; - pa->used += dlused - prev->ds_phys->ds_used_bytes; - pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes; - pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes; + return (err); + pa->used += dlused; + pa->comp += dlcomp; + pa->uncomp += dluncomp; + } - /* - * We could be a clone of a clone. If we reach our - * parent's branch point, we're done. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. 
+ */ + if (pa->origin_origin) { + pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; + pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ - err = dsl_dir_transfer_possible(pdd, dd, pa->used); + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + pa->used); + if (err) + return (err); -out: - if (ds && ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - if (pivot_ds) - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - if (newnext_ds) - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - if (name) - kmem_free(name, MAXPATHLEN); - return (err); + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> bplist_space_birthrange() + * calls will be fast because they do not have to + * iterate over all bps. + */ + snap = list_head(&pa->origin_snaps); + err = snaplist_space(&pa->shared_snaps, + snap->ds->ds_origin_txg, &pa->cloneusedsnap); + if (err) + return (err); + + err = snaplist_space(&pa->clone_snaps, + snap->ds->ds_origin_txg, &space); + if (err) + return (err); + pa->cloneusedsnap += space; + } + if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&pa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); + if (err) + return (err); + } + + return (0); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds, *pivot_ds; - char *name; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; - ASSERT(dd->dd_phys->dd_clone_parent_obj != 0); ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); - VERIFY(0 == dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)); + snap = list_head(&pa->origin_snaps); + origin_head = snap->ds; + /* - * We need to explicitly open pdd, since pivot_ds's pdd will be + * We need to explicitly open odd, since origin_ds's dd will be * changing. 
*/ - VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object, - NULL, FTAG, &pdd)); + VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (origin_ds->ds_phys->ds_next_clones_obj) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + origin_ds->ds_phys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + oldnext_obj, tx)); + } - /* move snapshots to this dir */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { - dsl_dataset_t *prev; + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); + dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + hds->ds_origin_txg = origin_head->ds_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + /* move snapshots to this dir */ + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* unregister props as dsl_dir is changing */ + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } /* move snap name entry */ - dsl_dataset_name(ds, name); - VERIFY(0 == zap_remove(dp->dp_meta_objset, - pa->snapnames_obj, ds->ds_snapname, tx)); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY(0 == dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object); + ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, pdd); + ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_close(ds->ds_dir, ds); VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); ASSERT3U(dsl_prop_numcb(ds), ==, 0); + } - if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. 
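+ *
+ * For example (figures illustrative): if the clone's dir currently
+ * charges 0 to DD_USED_SNAP and pa->cloneusedsnap is 150, then
+ * delta = 150; the clone gains 150 under DD_USED_SNAP and
+ * pa->used - 150 under DD_USED_HEAD.  Symmetrically, if the
+ * origin's dir charges 180 and pa->originusedsnap is 30, its
+ * delta = -150 and it sheds the same split.  The deltas cancel
+ * within each dir, so each dir's total moves by exactly
+ * +/- pa->used.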
+ */ + + delta = pa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(pa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + pa->used - delta, pa->comp, pa->uncomp, tx); + + delta = pa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(pa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -pa->used - delta, -pa->comp, -pa->uncomp, tx); + + origin_ds->ds_phys->ds_unique_bytes = pa->unique; + + /* log history record */ + spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", hds->ds_object); + + dsl_dir_close(odd, FTAG); +} + +static char *snaplist_tag = "snaplist"; +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, boolean_t own, + uint64_t first_obj, uint64_t last_obj, list_t *l) +{ + uint64_t obj = last_obj; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)); + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; + if (own) { + err = dsl_dataset_own_obj(dp, obj, + 0, snaplist_tag, &ds); + if (err == 0) + dsl_dataset_make_exclusive(ds, snaplist_tag); + } else { + err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); + } + if (err == ENOENT) { + /* lost race with snapshot destroy */ + struct promotenode *last = list_tail(l); + ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); + obj = last->ds->ds_phys->ds_prev_snap_obj; + continue; + } else if (err) { + return (err); } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - /* change pivot point's next snap */ - dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx); - pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; + return (0); +} - /* change clone_parent-age */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object); - dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj; - dmu_buf_will_dirty(pdd->dd_dbuf, tx); - pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object; +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; - /* change space accounting */ - dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx); - dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); - pivot_ds->ds_phys->ds_unique_bytes = pa->unique; + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used; + int err = bplist_space_birthrange(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used); + if (err) + 
return (err); + *spacep += used; + } + return (0); +} - dsl_dir_close(pdd, FTAG); - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(name, MAXPATHLEN); +static void +snaplist_destroy(list_t *l, boolean_t own) +{ + struct promotenode *snap; + + if (!list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + if (own) + dsl_dataset_disown(snap->ds, snaplist_tag); + else + dsl_dataset_rele(snap->ds, snaplist_tag); + kmem_free(snap, sizeof (struct promotenode)); + } + list_destroy(l); } +/* + * Promote a clone. Nomenclature note: + * "clone" or "cds": the original clone which is being promoted + * "origin" or "ods": the snapshot which is originally clone's origin + * "origin head" or "ohds": the dataset which is the head + * (filesystem/volume) for the origin + * "origin origin": the origin of the origin's filesystem (typically + * NULL, indicating that the clone is not a clone of a clone). + */ int dsl_dataset_promote(const char *name) { dsl_dataset_t *ds; - int err; + dsl_dir_t *dd; + dsl_pool_t *dp; dmu_object_info_t doi; - struct promotearg pa; + struct promotearg pa = { 0 }; + struct promotenode *snap; + int err; - err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds); + err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); + dd = ds->ds_dir; + dp = dd->dd_pool; - err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset, + err = dmu_object_info(dp->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, &doi); if (err) { - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } + if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* + * We are going to inherit all the snapshots taken before our + * origin (i.e., our new origin will be our parent's origin). + * Take ownership of them so that we can rename them into our + * namespace. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, + &pa.shared_snaps); + if (err != 0) + goto out; + + err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); + if (err != 0) + goto out; + + snap = list_head(&pa.shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); + if (err != 0) + goto out; + + if (dsl_dir_is_clone(snap->ds->ds_dir)) { + err = dsl_dataset_own_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + 0, FTAG, &pa.origin_origin); + if (err != 0) + goto out; + } + +out: + rw_exit(&dp->dp_config_rwlock); + /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. 
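 *
 * For instance (illustrative), a snapnames zap occupying 4 physical
 * blocks charges the sync task 2 + 2*4 = 10 units of dirty space --
 * a deliberately conservative estimate.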
*/ - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (err == 0) { + err = dsl_sync_task_do(dp, dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, + 2 + 2 * doi.doi_physical_blks); + } + + snaplist_destroy(&pa.shared_snaps, B_TRUE); + snaplist_destroy(&pa.clone_snaps, B_FALSE); + snaplist_destroy(&pa.origin_snaps, B_FALSE); + if (pa.origin_origin) + dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } +struct cloneswaparg { + dsl_dataset_t *cds; /* clone dataset */ + dsl_dataset_t *ohds; /* origin's head dataset */ + boolean_t force; + int64_t unused_refres_delta; /* change in unconsumed refreservation */ +}; + +/* ARGSUSED */ +static int +dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + + /* they should both be heads */ + if (dsl_dataset_is_snapshot(csa->cds) || + dsl_dataset_is_snapshot(csa->ohds)) + return (EINVAL); + + /* the branch point should be just before them */ + if (csa->cds->ds_prev != csa->ohds->ds_prev) + return (EINVAL); + + /* cds should be the clone */ + if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != + csa->ohds->ds_object) + return (EINVAL); + + /* the clone should be a child of the origin */ + if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) + return (EINVAL); + + /* ohds shouldn't be modified unless 'force' */ + if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) + return (ETXTBSY); + + /* adjust amount of any unconsumed refreservation */ + csa->unused_refres_delta = + (int64_t)MIN(csa->ohds->ds_reserved, + csa->ohds->ds_phys->ds_unique_bytes) - + (int64_t)MIN(csa->ohds->ds_reserved, + csa->cds->ds_phys->ds_unique_bytes); + + if (csa->unused_refres_delta > 0 && + csa->unused_refres_delta > + dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; + + ASSERT(csa->cds->ds_reserved == 0); + ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + + dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); + + if (csa->cds->ds_user_ptr != NULL) { + csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); + csa->cds->ds_user_ptr = NULL; + } + + if (csa->ohds->ds_user_ptr != NULL) { + csa->ohds->ds_user_evict_func(csa->ohds, + csa->ohds->ds_user_ptr); + csa->ohds->ds_user_ptr = NULL; + } + + /* reset origin's unique bytes */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = csa->ohds->ds_phys->ds_bp; + csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; + csa->cds->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(csa->cds->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, + &cdl_comp, &cdl_uncomp)); + VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, + &odl_comp, &odl_uncomp)); + + dused = 
csa->cds->ds_phys->ds_used_bytes + cdl_used - + (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - + (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + + cdl_uncomp - + (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). + */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + + /* swap ds_*_bytes */ + SWITCH64(csa->ohds->ds_phys->ds_used_bytes, + csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, + csa->cds->ds_phys->ds_compressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, + csa->cds->ds_phys->ds_uncompressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, + csa->cds->ds_phys->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, + csa->unused_refres_delta, 0, 0, tx); + + /* swap deadlists */ + bplist_close(&csa->cds->ds_deadlist); + bplist_close(&csa->ohds->ds_deadlist); + SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, + csa->cds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj)); + VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj)); +} + +/* + * Swap 'clone' with its origin head file system. Used at the end + * of "online recv" to swizzle the file system to the new version. + */ +int +dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force) +{ + struct cloneswaparg csa; + int error; + + ASSERT(clone->ds_owner); + ASSERT(origin_head->ds_owner); +retry: + /* Need exclusive access for the swap */ + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + rw_exit(&clone->ds_rwlock); + rw_enter(&origin_head->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { + rw_exit(&origin_head->ds_rwlock); + goto retry; + } + } + csa.cds = clone; + csa.ohds = origin_head; + csa.force = force; + error = dsl_sync_task_do(clone->ds_dir->dd_pool, + dsl_dataset_clone_swap_check, + dsl_dataset_clone_swap_sync, &csa, NULL, 9); + return (error); +} + /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. 
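The retry loop in dsl_dataset_clone_swap() above is a standard two-lock dance: block on one rwlock, try the other without blocking, and on failure drop everything and retry in the opposite order, so two threads acquiring the pair in different orders cannot deadlock. A minimal standalone sketch of the same idea using POSIX rwlocks (function name hypothetical, no ZFS types):

	#include <pthread.h>

	/*
	 * Write-lock both a and b without deadlocking against a peer
	 * that takes them in the opposite order: block on one lock,
	 * merely try the other, and alternate on failure.
	 */
	static void
	lock_pair(pthread_rwlock_t *a, pthread_rwlock_t *b)
	{
		for (;;) {
			pthread_rwlock_wrlock(a);
			if (pthread_rwlock_trywrlock(b) == 0)
				return;		/* got both */
			pthread_rwlock_unlock(a);
			/* retry in the opposite order */
			pthread_rwlock_wrlock(b);
			if (pthread_rwlock_trywrlock(a) == 0)
				return;
			pthread_rwlock_unlock(b);
		}
	}

As for the unused_refres_delta computed in the check function: with a refreservation of 100 and unique bytes of 70 on the origin head versus 40 on the clone (figures illustrative), the delta is MIN(100, 70) - MIN(100, 40) = 30, which the sync function then charges to DD_USED_REFRSRV.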
@@ -2013,23 +2887,220 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { spa_t *spa; dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds; int error; if ((error = spa_open(pname, &spa, FTAG)) != 0) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_open_obj(dp, obj, - NULL, DS_MODE_NONE, FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - return (error); + if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); } - dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +/* ARGSUSED */ +static int +dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); + + if (new_quota == 0) + return (0); + + if (new_quota < ds->ds_phys->ds_used_bytes || + new_quota < ds->ds_reserved) + return (ENOSPC); + return (0); } + +/* ARGSUSED */ +void +dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + ds->ds_quota = new_quota; + + dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + + spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, ds->ds_object); +} + +int +dsl_dataset_set_quota(const char *dsname, uint64_t quota) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + if (quota != ds->ds_quota) { + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. 
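+ * (dsl_dataset_check_quota() above encodes the same reasoning: with
+ * a refquota of 100 and illustrative figures, 90 used plus 20 in
+ * flight returns ERESTART -- the estimate is over quota but the
+ * on-disk figure is not, and the pending changes may include frees
+ * -- whereas 110 used with nothing in flight returns a hard EDQUOT.)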
+ */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, "a, 0); + } + dsl_dataset_rele(ds, FTAG); + return (err); +} + +static int +dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + int64_t delta; + uint64_t unique; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_REFRESERVATION) + return (ENOTSUP); + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); + mutex_exit(&ds->ds_lock); + + if (delta > 0 && + delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (delta > 0 && ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t unique; + int64_t delta; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(0, (int64_t)(new_reservation - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = new_reservation; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + new_reservation, cr, tx); + + spa_history_internal_log(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, ds->ds_object); +} + +int +dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_reservation_check, + dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_rele(ds, FTAG); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c new file mode 100644 index 000000000000..2ce16fe20e12 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -0,0 +1,735 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$<id> identifies permissions granted locally for this userid. + * ud$<id> identifies permissions granted on descendent datasets for + * this userid. + * Ul$<id> identifies permission sets granted locally for this userid. + * Ud$<id> identifies permission sets granted on descendent datasets for + * this userid. + * gl$<id> identifies permissions granted locally for this groupid. + * gd$<id> identifies permissions granted on descendent datasets for + * this groupid. + * Gl$<id> identifies permission sets granted locally for this groupid. + * Gd$<id> identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. + * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. + * s-$@<name> permissions defined in specified set @<name> + * S-$@<name> Sets defined in named set @<name> + * + * Each of the above entities points to another zap attribute that contains one + * attribute for each allowed permission, such as create, destroy,... + * All of the "upper" case class types will specify permission set names + * rather than permissions. + * + * Basically it looks something like this: + * ul$12 -> ZAP OBJ -> permissions... + * + * The ZAP OBJ is referred to as the jump object. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/cred.h> +#include <sys/sunddi.h> + +#include "zfs_deleg.h" + +/* + * Validate that user is allowed to delegate specified permissions. + * + * In order to delegate "create" you must have "create" + * and "allow". 
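+ *
+ * E.g. (illustrative): a user holding only "allow" may not delegate
+ * "destroy" unless he also holds "destroy"; and no one may delegate
+ * "allow" itself, which is always rejected with EPERM below.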
+ */ +int +dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + nvlist_t *perms; + nvpair_t *permpair = NULL; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + + if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) + return (EPERM); + + if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) + return (error); + } + } + return (0); +} + +/* + * Validate that user is allowed to unallow specified permissions. They + * must have the 'allow' permission, and even then can only unallow + * perms for their uid. + */ +int +dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + char idstr[32]; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + (void) snprintf(idstr, sizeof (idstr), "%lld", + (longlong_t)crgetuid(cr)); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; + + if (type != ZFS_DELEG_USER && + type != ZFS_DELEG_USER_SETS) + return (EPERM); + + if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) + return (EPERM); + } + return (0); +} + +static void +dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, + DMU_OT_NONE, 0, tx); + VERIFY(zap_update(mos, zapobj, + whokey, 8, 1, &jumpobj, tx) == 0); + } + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + VERIFY(zap_update(mos, jumpobj, + perm, 8, 1, &n, tx) == 0); + spa_history_internal_log(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +static void +dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + return; + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + if (nvpair_value_nvlist(whopair, &perms) != 0) { + if (zap_lookup(mos, zapobj, whokey, 8, + 1, &jumpobj) == 0) { + (void) zap_remove(mos, zapobj, whokey, tx); + VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s dataset = %llu", whokey, + dd->dd_phys->dd_head_dataset_obj); + continue; + } + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) + continue; + + while 
(permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + (void) zap_remove(mos, jumpobj, perm, tx); + if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { + (void) zap_remove(mos, zapobj, + whokey, tx); + VERIFY(0 == zap_destroy(mos, + jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_dir_t *dd; + int error; + nvpair_t *whopair = NULL; + int blocks_modified = 0; + + error = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (error) + return (error); + + if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dir_close(dd, FTAG); + return (ENOTSUP); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) + blocks_modified++; + + error = dsl_sync_task_do(dd->dd_pool, NULL, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + dd, nvp, blocks_modified); + dsl_dir_close(dd, FTAG); + + return (error); +} + +/* + * Find all 'allow' permissions from a given point and then continue + * traversing up to the root. + * + * This function constructs an nvlist of nvlists. + * each setpoint is an nvlist composed of an nvlist of an nvlist + * of the individual * users/groups/everyone/create + * permissions. + * + * The nvlist will look like this. + * + * { source fsname -> { whokeys { permissions,...}, ...}} + * + * The fsname nvpairs will be arranged in a bottom up order. For example, + * if we have the following structure a/b/c then the nvpairs for the fsnames + * will be ordered a/b/c, a/b, a. + */ +int +dsl_deleg_get(const char *ddname, nvlist_t **nvp) +{ + dsl_dir_t *dd, *startdd; + dsl_pool_t *dp; + int error; + objset_t *mos; + + error = dsl_dir_open(ddname, FTAG, &startdd, NULL); + if (error) + return (error); + + dp = startdd->dd_pool; + mos = dp->dp_meta_objset; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = startdd; dd != NULL; dd = dd->dd_parent) { + zap_cursor_t basezc; + zap_attribute_t baseza; + nvlist_t *sp_nvp; + uint64_t n; + char source[MAXNAMELEN]; + + if (dd->dd_phys->dd_deleg_zapobj && + (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, + &n) == 0) && n) { + VERIFY(nvlist_alloc(&sp_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + } else { + continue; + } + + for (zap_cursor_init(&basezc, mos, + dd->dd_phys->dd_deleg_zapobj); + zap_cursor_retrieve(&basezc, &baseza) == 0; + zap_cursor_advance(&basezc)) { + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *perms_nvp; + + ASSERT(baseza.za_integer_length == 8); + ASSERT(baseza.za_num_integers == 1); + + VERIFY(nvlist_alloc(&perms_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + for (zap_cursor_init(&zc, mos, baseza.za_first_integer); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(nvlist_add_boolean(perms_nvp, + za.za_name) == 0); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, + perms_nvp) == 0); + nvlist_free(perms_nvp); + } + + zap_cursor_fini(&basezc); + + dsl_dir_name(dd, source); + VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + nvlist_free(sp_nvp); + } + rw_exit(&dp->dp_config_rwlock); + + dsl_dir_close(startdd, FTAG); + return (0); +} + +/* + * Routines for dsl_deleg_access() -- access checking. 
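+ *
+ * For instance (names illustrative): checking "destroy" for uid 1001
+ * on tank/a/b walks tank/a/b, then tank/a, then tank; at the first
+ * level the attribute probed is "ul$1001" (a local grant), at the
+ * ancestors "ud$1001" (a descendent grant), falling back to group
+ * and everyone entries and to any permission sets those name.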
+ */ +typedef struct perm_set { + avl_node_t p_node; + boolean_t p_matched; + char p_setname[ZFS_MAX_DELEG_NAME]; +} perm_set_t; + +static int +perm_set_compare(const void *arg1, const void *arg2) +{ + const perm_set_t *node1 = arg1; + const perm_set_t *node2 = arg2; + int val; + + val = strcmp(node1->p_setname, node2->p_setname); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * Determine whether a specified permission exists. + * + * First the base attribute has to be retrieved. i.e. ul$12 + * Once the base object has been retrieved the actual permission + * is lookup up in the zap object the base object points to. + * + * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if + * there is no perm in that jumpobj. + */ +static int +dsl_check_access(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, const char *perm) +{ + int error; + uint64_t jumpobj, zero; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error == 0) { + error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); + if (error == ENOENT) + error = EPERM; + } + return (error); +} + +/* + * check a specified user/group for a requested permission + */ +static int +dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, + int checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids; + int i; + uint64_t id; + + /* check for user */ + id = crgetuid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_USER, checkflag, &id, perm) == 0) + return (0); + + /* check for users primary group */ + id = crgetgid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + + /* check for everyone entry */ + id = -1; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) + return (0); + + /* check each supplemental group user is a member of */ + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + } + + return (EPERM); +} + +/* + * Iterate over the sets specified in the specified zapobj + * and load them into the permsets avl tree. + */ +static int +dsl_load_sets(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, avl_tree_t *avl) +{ + zap_cursor_t zc; + zap_attribute_t za; + perm_set_t *permnode; + avl_index_t idx; + uint64_t jumpobj; + int error; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error != 0) + return (error); + + for (zap_cursor_init(&zc, mos, jumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); + (void) strlcpy(permnode->p_setname, za.za_name, + sizeof (permnode->p_setname)); + permnode->p_matched = B_FALSE; + + if (avl_find(avl, permnode, &idx) == NULL) { + avl_insert(avl, permnode, idx); + } else { + kmem_free(permnode, sizeof (perm_set_t)); + } + } + zap_cursor_fini(&zc); + return (0); +} + +/* + * Load all permissions user based on cred belongs to. 
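+ *
+ * (For uid 1001 with primary gid 100 and supplemental groups
+ * {10, 200} -- ids illustrative -- the set whokeys probed below are
+ * "Ul$1001", "Gl$100", "El$", "Gl$10" and "Gl$200" for a local
+ * check, and the descendent variants otherwise.)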
+ */ +static void +dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, + char checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids, i; + uint64_t id; + + id = crgetuid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_USER_SETS, checkflag, &id, avl); + + id = crgetgid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); + + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + } +} + +/* + * Check if user has requested permission. + */ +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag = ZFS_DELEG_LOCAL; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) { + dsl_dataset_rele(ds, FTAG); + return (ECANCELED); + } + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dataset_rele(ds, FTAG); + return (EPERM); + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL) != 0) + break; + if (!zoned) + break; + } + zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = EPERM; +success: + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (os->os->os_spa->spa_delegation); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 5e563b632909..48d87f97f669 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -19,26 +19,28 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> +#include <sys/dmu_objset.h> #include <sys/dmu_tx.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/arc.h> +#include <sys/sunddi.h> #include "zfs_namecheck.h" -static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx); /* ARGSUSED */ @@ -55,8 +57,6 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) ASSERT(dd->dd_space_towrite[t] == 0); } - ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); - if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); @@ -91,9 +91,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif - /* XXX assert bonus buffer size is correct */ if (dd == NULL) { dsl_dir_t *winner; int err; @@ -103,7 +103,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; - dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), @@ -112,36 +111,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) { - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } + if (err) + goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, - ddobj, dd->dd_myname); - } - if (err) { - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); + dd->dd_parent->dd_phys->dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); } + if (err) + goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } @@ -174,6 +162,15 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); + +errout: + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } void @@ -404,27 +401,37 @@ dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) } uint64_t -dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) { - objset_t *mos = pds->dd_pool->dp_meta_objset; + objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *dsphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, 
&ddobj, tx)); + if (pds) { + VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->dd_creation_time = gethrestime_sec(); - dsphys->dd_parent_obj = pds->dd_object; + if (pds) + dsphys->dd_parent_obj = pds->dd_object; dsphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -461,23 +468,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val, obj; + dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. */ val = 0; - dsl_dir_set_reservation_sync(dd, &val, tx); - ASSERT3U(dd->dd_used_bytes, ==, 0); + dsl_dir_set_reservation_sync(dd, &val, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); VERIFY(0 == zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); @@ -486,65 +497,53 @@ dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) VERIFY(0 == dmu_object_free(mos, obj, tx)); } -void -dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) { - dsl_dir_phys_t *dsp; - dmu_buf_t *dbuf; - int error; - - *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - - error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, - sizeof (uint64_t), 1, ddobjp, tx); - ASSERT3U(error, ==, 0); - - VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsp = dbuf->db_data; - - dsp->dd_creation_time = gethrestime_sec(); - dsp->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsp->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - - dmu_buf_rele(dbuf, FTAG); + return (dd->dd_phys->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dd->dd_phys->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_dir_space_available(dd, NULL, 0, TRUE)); - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dd->dd_phys->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 
dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dd->dd_phys->dd_compressed_bytes == 0 ? 100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + } mutex_exit(&dd->dd_lock); - if (dd->dd_phys->dd_clone_parent_obj) { + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dd->dd_pool->dp_config_rwlock); - + dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } + rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -580,7 +579,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ @@ -588,15 +586,13 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) } static uint64_t -dsl_dir_estimated_space(dsl_dir_t *dd) +dsl_dir_space_towrite(dsl_dir_t *dd) { - int64_t space; + uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_phys->dd_used_bytes; - ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); @@ -630,13 +626,9 @@ dsl_dir_space_available(dsl_dir_t *dd, mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; - if (ondiskonly) { - used = dd->dd_used_bytes; - } else { - used = dsl_dir_estimated_space(dd); - } - if (dd == ancestor) - used += delta; + used = dd->dd_phys->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); @@ -651,6 +643,14 @@ dsl_dir_space_available(dsl_dir_t *dd, parentspace += dd->dd_phys->dd_reserved - used; } + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + if (used > quota) { /* over quota */ myspace = 0; @@ -678,50 +678,68 @@ dsl_dir_space_available(dsl_dir_t *dd, struct tempreserve { list_node_t tr_node; + dsl_pool_t *tr_dp; dsl_dir_t *tr_ds; uint64_t tr_size; }; -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). 
- */ static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, - uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; - uint64_t est_used, quota, parent_rsrv; - int edquot = EDQUOT; + uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + struct tempreserve *tr; + int enospc = EDQUOT; int txgidx = txg & TXG_MASK; int i; - struct tempreserve *tr; + uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); + /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ - est_used = dsl_dir_estimated_space(dd); + est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) - est_used += dd->dd_tempreserved[i]; + est_inflight += dd->dd_tempreserved[i]; + used_on_disk = dd->dd_phys->dd_used_bytes; - quota = UINT64_MAX; + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. + */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, checkrefquota, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error) { + mutex_exit(&dd->dd_lock); + return (error); + } + } - if (dd->dd_phys->dd_quota) + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + quota = UINT64_MAX; + else quota = dd->dd_phys->dd_quota; /* - * If this transaction will result in a net free of space, we want - * to let it through, but we have to be careful: the space that it - * frees won't become available until *after* this txg syncs. - * Therefore, to ensure that it's possible to remove files from - * a full pool without inducing transient overcommits, we throttle + * Adjust the quota against the actual pool size at the root. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of @@ -731,47 +749,45 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); if (poolsize < quota) { quota = poolsize; - edquot = ENOSPC; + enospc = ENOSPC; } - } else if (netfree) { - quota = UINT64_MAX; } /* * If they are requesting more space, and our current estimate - * is over quota. They get to try again unless the actual + * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). 
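 * For example (illustrative figures): with quota 100, used_on_disk 80
 * and est_inflight 30 the caller gets ERESTART, since the estimate
 * of 110 is over quota while the on-disk 80 is not; used_on_disk 105
 * with nothing in flight gets a hard EDQUOT -- or ENOSPC when the
 * binding limit was the pool size rather than the quota.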
	 */
-	if (asize > 0 && est_used > quota) {
-		if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
-		    dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
-		    dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
-		    dd->dd_used_bytes < quota)
-			edquot = ERESTART;
-		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+	if (used_on_disk + est_inflight > quota) {
+		if (est_inflight > 0 || used_on_disk < quota)
+			enospc = ERESTART;
+		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
-		    dd->dd_used_bytes>>10, est_used>>10,
-		    quota>>10, asize>>10, edquot);
+		    used_on_disk>>10, est_inflight>>10,
+		    quota>>10, asize>>10, enospc);
		mutex_exit(&dd->dd_lock);
-		return (edquot);
+		return (enospc);
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txgidx] += asize;

-	parent_rsrv = parent_delta(dd, est_used, asize);
+	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

-	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent && parent_rsrv) {
+		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
		return (dsl_dir_tempreserve_impl(dd->dd_parent,
-		    parent_rsrv, netfree, tr_list, tx));
+		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
	} else {
		return (0);
	}
@@ -779,42 +795,62 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,

/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
 */
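The error-code choice above is easy to miss: a request that pushes past the quota is retried (ERESTART) whenever in-flight changes might still free space, and only fails hard once on-disk usage alone is over quota. A compressed sketch of that decision; this assumes the kernel errno values are visible to user code, which is platform dependent:

#include <errno.h>
#include <stdint.h>

/* mirrors the enospc/ERESTART selection in dsl_dir_tempreserve_impl() */
static int
quota_errno(uint64_t used_on_disk, uint64_t est_inflight, uint64_t quota,
    int quota_is_poolsize)
{
	if (used_on_disk + est_inflight <= quota)
		return (0);		/* the reservation fits */
	if (est_inflight > 0 || used_on_disk < quota)
		return (ERESTART);	/* pending txgs may free space */
	return (quota_is_poolsize ? ENOSPC : EDQUOT);
}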
int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
-    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
{
-	int err = 0;
+	int err;
	list_t *tr_list;

+	if (asize == 0) {
+		*tr_cookiep = NULL;
+		return (0);
+	}
+
	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
-	ASSERT3S(asize, >=, 0);
+	ASSERT3S(asize, >, 0);
	ASSERT3S(fsize, >=, 0);

-	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
-	    tr_list, tx);
-
+	err = arc_tempreserve_space(lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

-		err = arc_tempreserve_space(lsize);
-		if (err == 0) {
-			tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
-			tr->tr_ds = NULL;
-			tr->tr_size = lsize;
-			list_insert_tail(tr_list, tr);
+		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+		tr->tr_size = lsize;
+		list_insert_tail(tr_list, tr);
+
+		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
+	} else {
+		if (err == EAGAIN) {
+			txg_delay(dd->dd_pool, tx->tx_txg, 1);
+			err = ERESTART;
		}
+		dsl_pool_memory_pressure(dd->dd_pool);
+	}
+
+	if (err == 0) {
+		struct tempreserve *tr;
+
+		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+		tr->tr_dp = dd->dd_pool;
+		tr->tr_size = asize;
+		list_insert_tail(tr_list, tr);
+
+		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+		    FALSE, asize > usize, tr_list, tx, TRUE);
	}

	if (err)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;
+
	return (err);
}

@@ -831,15 +867,20 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)

	ASSERT3U(tx->tx_txg, !=, 0);

+	if (tr_cookie == NULL)
+		return;
+
	while (tr = list_head(tr_list)) {
-		if (tr->tr_ds == NULL) {
-			arc_tempreserve_clear(tr->tr_size);
-		} else {
+		if (tr->tr_dp) {
+			dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
+		} else if (tr->tr_ds) {
			mutex_enter(&tr->tr_ds->dd_lock);
			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
			    tr->tr_size);
			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
			mutex_exit(&tr->tr_ds->dd_lock);
+		} else {
+			arc_tempreserve_clear(tr->tr_size);
		}
		list_remove(tr_list, tr);
		kmem_free(tr, sizeof (struct tempreserve));
@@ -848,13 +889,8 @@
	}

	kmem_free(tr_list, sizeof (list_t));
}
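After this change dsl_dir_tempreserve_clear() has to undo three kinds of records (pool-wide, per-dir, and ARC), discriminated by which field of struct tempreserve is non-NULL. A toy version of that dispatch; the types and the counter arguments are invented for illustration:

#include <stdint.h>
#include <stdlib.h>

struct tr {
	struct tr *next;
	void *tr_dp;		/* set: a pool-wide reservation */
	void *tr_ds;		/* set: a per-dir reservation */
	uint64_t tr_size;	/* neither set: an ARC reservation */
};

static void
clear_all(struct tr *head, uint64_t *pool, uint64_t *dir, uint64_t *arc)
{
	while (head != NULL) {
		struct tr *next = head->next;
		if (head->tr_dp != NULL)
			*pool -= head->tr_size;
		else if (head->tr_ds != NULL)
			*dir -= head->tr_size;
		else
			*arc -= head->tr_size;
		free(head);
		head = next;
	}
}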
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data.  Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+static void
+dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	int64_t parent_space;
	uint64_t est_used;
@@ -863,7 +899,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
	if (space > 0)
		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

-	est_used = dsl_dir_estimated_space(dd);
+	est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
	parent_space = parent_delta(dd, est_used, space);
	mutex_exit(&dd->dd_lock);

@@ -872,39 +908,96 @@
	/* XXX this is potentially expensive and unnecessary... */
	if (parent_space && dd->dd_parent)
-		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+		dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
+}
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * eg. when dirtying data.  Be conservative (ie. OK to write less than
+ * this or free more than this, but don't write more or free less).
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	dsl_pool_willuse_space(dd->dd_pool, space, tx);
+	dsl_dir_willuse_space_impl(dd, space, tx);
}

/* call from syncing context when we actually write/free space for this dd */
void
-dsl_dir_diduse_space(dsl_dir_t *dd,
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;
+	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(type < DD_USED_NUM);

	dsl_dir_dirty(dd, tx);

-	mutex_enter(&dd->dd_lock);
-	accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
-	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+	if (needlock)
+		mutex_enter(&dd->dd_lock);
+	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
+	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 ||
	    dd->dd_phys->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
-	dd->dd_used_bytes += used;
+	dd->dd_phys->dd_used_bytes += used;
	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
	dd->dd_phys->dd_compressed_bytes += compressed;
-	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+		ASSERT(used > 0 ||
+		    dd->dd_phys->dd_used_breakdown[type] >= -used);
+		dd->dd_phys->dd_used_breakdown[type] += used;
+#ifdef DEBUG
+		dd_used_t t;
+		uint64_t u = 0;
+		for (t = 0; t < DD_USED_NUM; t++)
+			u += dd->dd_phys->dd_used_breakdown[t];
+		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+#endif
+	}
+	if (needlock)
+		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
-		dsl_dir_diduse_space(dd->dd_parent,
+		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    accounted_delta, compressed, uncompressed, tx);
+		dsl_dir_transfer_space(dd->dd_parent,
+		    used - accounted_delta,
+		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}

-/* ARGSUSED */
+void
+dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(oldtype < DD_USED_NUM);
+	ASSERT(newtype < DD_USED_NUM);
+
+	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+		return;
+
+	dsl_dir_dirty(dd, tx);
+	if (needlock)
+		mutex_enter(&dd->dd_lock);
+	ASSERT(delta > 0 ?
+	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
+	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
+	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
+	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
+	dd->dd_phys->dd_used_breakdown[newtype] += delta;
+	if (needlock)
+		mutex_exit(&dd->dd_lock);
+}
+
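dsl_dir_transfer_space() above only moves bytes between two buckets of the used-space breakdown; the total (dd_used_bytes) never changes. For example, space that was counted as a child's unused reservation becomes plain child usage once it is actually written. A minimal model, with abbreviated bucket names and delta assumed positive, as an editorial sketch:

#include <assert.h>
#include <stdint.h>

enum { HEAD, SNAP, CHILD, CHILD_RSRV, REFRSRV, NBUCKETS };

static void
transfer_space(uint64_t breakdown[NBUCKETS], int from, int to, uint64_t delta)
{
	assert(breakdown[from] >= delta);
	breakdown[from] -= delta;
	breakdown[to] += delta;
	/* the sum over all buckets (dd_used_bytes) is unchanged */
}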
static int
dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -921,22 +1014,22 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
-	 * pending changes could under-estimat the amount of space to be
+	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
-	towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
-	    dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+	towrite = dsl_dir_space_towrite(dd);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (new_quota < dd->dd_phys->dd_reserved ||
-	    new_quota < dsl_dir_estimated_space(dd))) {
+	    new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
		err = ENOSPC;
	}
	mutex_exit(&dd->dd_lock);
	return (err);
}

+/* ARGSUSED */
static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	uint64_t *quotap = arg2;
@@ -947,6 +1040,10 @@
	mutex_enter(&dd->dd_lock);
	dd->dd_phys->dd_quota = new_quota;
	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu ",
+	    (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
}

int
@@ -958,20 +1055,22 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
	if (err)
		return (err);
-	/*
-	 * If someone removes a file, then tries to set the quota, we
-	 * want to make sure the file freeing takes effect.
-	 */
-	txg_wait_open(dd->dd_pool, 0);

-	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-	    dsl_dir_set_quota_sync, dd, &quota, 0);
+	if (quota != dd->dd_phys->dd_quota) {
+		/*
+		 * If someone removes a file, then tries to set the quota, we
+		 * want to make sure the file freeing takes effect.
+		 */
+		txg_wait_open(dd->dd_pool, 0);
+
+		err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+		    dsl_dir_set_quota_sync, dd, &quota, 0);
+	}
+
	dsl_dir_close(dd, FTAG);
	return (err);
}

-/* ARGSUSED */
-static int
+int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
@@ -991,7 +1090,7 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
		return (0);

	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
	delta = MAX(used, new_reservation) -
	    MAX(used, dd->dd_phys->dd_reserved);
	mutex_exit(&dd->dd_lock);
@@ -1011,8 +1110,9 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
	return (0);
}

+/* ARGSUSED */
static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	uint64_t *reservationp = arg2;
@@ -1020,19 +1120,24 @@
	uint64_t used;
	int64_t delta;

+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
	delta = MAX(used, new_reservation) -
	    MAX(used, dd->dd_phys->dd_reserved);
-	mutex_exit(&dd->dd_lock);
-
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_reserved = new_reservation;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
-		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+		    delta, 0, 0, tx);
	}
+	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu",
+	    (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
}

int
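The reservation delta computed in both the check and sync functions, delta = MAX(used, new) - MAX(used, old), charges ancestors only for the part of a reservation exceeding current usage. A worked example under assumed numbers: with used = 10G, raising the reservation from 2G to 15G gives MAX(10,15) - MAX(10,2) = 5G, since the first 10G is already accounted as used space.

#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* editorial sketch of the ancestor charge when a reservation changes */
static int64_t
reservation_delta(uint64_t used, uint64_t old_resv, uint64_t new_resv)
{
	return ((int64_t)MAX(used, new_resv) - (int64_t)MAX(used, old_resv));
}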
@@ -1074,7 +1179,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
		return (delta);

	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_used_bytes, delta);
+	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
	mutex_exit(&dd->dd_lock);
	return (would_change(dd->dd_parent, delta, ancestor));
}

@@ -1084,7 +1189,7 @@ struct renamearg {
	const char *mynewname;
};

-/* ARGSUSED */
+/*ARGSUSED*/
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1110,7 +1215,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
	if (ra->newparent != dd->dd_parent) {
		/* is there enough space? */
		uint64_t myspace =
-		    MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);

		/* no rename into our descendant */
		if (closest_common_ancestor(dd, ra->newparent) == dd)
@@ -1125,7 +1230,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}

static void
-dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct renamearg *ra = arg2;
@@ -1136,15 +1241,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
	ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);

	if (ra->newparent != dd->dd_parent) {
-		uint64_t myspace =
-		    MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
-		dsl_dir_diduse_space(dd->dd_parent, -myspace,
+		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+		    -dd->dd_phys->dd_used_bytes,
		    -dd->dd_phys->dd_compressed_bytes,
		    -dd->dd_phys->dd_uncompressed_bytes, tx);
-		dsl_dir_diduse_space(ra->newparent, myspace,
+		dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
+		    dd->dd_phys->dd_used_bytes,
		    dd->dd_phys->dd_compressed_bytes,
		    dd->dd_phys->dd_uncompressed_bytes, tx);
+
+		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
+			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
+			    dd->dd_phys->dd_used_bytes;
+
+			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+			    -unused_rsrv, 0, 0, tx);
+			dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
+			    unused_rsrv, 0, 0, tx);
+		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1164,6 +1278,9 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
	err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
	ASSERT3U(err, ==, 0);
+
+	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+	    tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}

int
@@ -1189,7 +1306,6 @@ dsl_dir_rename(dsl_dir_t *dd, const char *newname)
		goto out;
	}

-
	err = dsl_sync_task_do(dd->dd_pool,
	    dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 00abf7ec2c6b..4585dc805fe5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -19,12 +19,10 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
@@ -36,20 +34,36 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+
+int zfs_no_write_throttle = 0;
+int zfs_write_limit_shift = 3;		/* 1/8th of physical memory */
+int zfs_txg_synctime = 5;		/* target secs to sync a txg */
+
+uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
+uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
+uint64_t zfs_write_limit_inflated = 0;
+uint64_t zfs_write_limit_override = 0;
+extern uint64_t zfs_write_limit_min;
+
+kmutex_t zfs_write_limit_lock;
+
+static pgcnt_t old_physmem = 0;

static int
-dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
-	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
+	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

-	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
+	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
@@ -62,6 +76,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
@@ -70,9 +85,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));
-	list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

+	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
+
	return (dp);
}

@@ -81,9 +99,11 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
	objset_impl_t *osi;

-	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
	if (err)
		goto out;
@@ -100,10 +120,73 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
	if (err)
		goto out;

-	err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

+	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+		if (err)
+			goto out;
+		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
+		    FTAG, &ds);
+		if (err)
+			goto out;
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    dp, &dp->dp_origin_snap);
+		if (err)
+			goto out;
+		dsl_dataset_rele(ds, FTAG);
+		dsl_dir_close(dd, dp);
+	}
+
+	/* get scrub status */
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+	    &dp->dp_scrub_func);
+	if (err == 0) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_queue_obj);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_min_txg);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_max_txg);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+		    &dp->dp_scrub_bookmark);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+		    &spa->spa_scrub_errors);
+		if (err)
+			goto out;
+		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+			/*
+			 * A new-type scrub was in progress on an old
+			 * pool.  Restart from the beginning, since the
+			 * old software may have changed the pool in the
+			 * meantime.
+			 */
+			dsl_pool_scrub_restart(dp);
+		}
+	} else {
+		/*
+		 * It's OK if there is no scrub in progress (and if
+		 * there was an I/O error, ignore it).
+		 */
+		err = 0;
+	}
+
out:
	rw_exit(&dp->dp_config_rwlock);
	if (err)
@@ -117,7 +200,15 @@
void
dsl_pool_close(dsl_pool_t *dp)
{
-	/* drop our reference from dsl_pool_open() */
+	/* drop our references from dsl_pool_open() */
+
+	/*
+	 * Since we held the origin_snap from "syncing" context (which
+	 * includes pool-opening context), it actually only got a "ref"
+	 * and not a hold, so just drop that here.
+	 */
+	if (dp->dp_origin_snap)
+		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_root_dir)
@@ -130,20 +221,27 @@ dsl_pool_close(dsl_pool_t *dp)
	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	txg_list_destroy(&dp->dp_sync_tasks);
-	list_destroy(&dp->dp_synced_objsets);
+	list_destroy(&dp->dp_synced_datasets);

-	arc_flush();
+	arc_flush(dp->dp_spa);
	txg_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
+	mutex_destroy(&dp->dp_lock);
+	mutex_destroy(&dp->dp_scrub_cancel_lock);
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
-dsl_pool_create(spa_t *spa, uint64_t txg)
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	objset_impl_t *osip;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+
+	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
@@ -153,13 +251,29 @@ dsl_pool_create(spa_t *spa, uint64_t txg)
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
-	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
-	(void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
-	VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+	VERIFY(0 == dsl_pool_open_special_dir(dp,
+	    MOS_DIR_NAME, &dp->dp_mos_dir));
+
+	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+		dsl_pool_create_origin(dp, tx);
+
+	/* create the root dataset */
+	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+
+	/* create the root objset */
+	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	osip = dmu_objset_create_impl(dp->dp_spa, ds,
+	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+#ifdef _KERNEL
+	zfs_create_fs(&osip->os, kcred, zplprops, tx);
+#endif
+	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

@@ -175,26 +289,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_impl_t *mosi = dp->dp_meta_objset->os;
+	hrtime_t start, write_time;
+	uint64_t data_written;
	int err;

	tx = dmu_tx_create_assigned(dp, txg);

+	dp->dp_read_overhead = 0;
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		if (!list_link_active(&ds->ds_synced_link))
-			list_insert_tail(&dp->dp_synced_objsets, ds);
+			list_insert_tail(&dp->dp_synced_datasets, ds);
		else
			dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
+	DTRACE_PROBE(pool_sync__1setup);
+
+	start = gethrtime();
	err = zio_wait(zio);
+	write_time = gethrtime() - start;
	ASSERT(err == 0);
+	DTRACE_PROBE(pool_sync__2rootzio);

	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
		dsl_sync_task_group_sync(dstg, tx);
+	DTRACE_PROBE(pool_sync__3task);
+
+	start = gethrtime();
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);
+	write_time += gethrtime() - start;
+
+	if (spa_sync_pass(dp->dp_spa) == 1)
+		dsl_pool_scrub_sync(dp, tx);

+	start = gethrtime();
	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -204,8 +334,51 @@
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}
+	write_time += gethrtime() - start;
+	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
+	    hrtime_t, dp->dp_read_overhead);
+	write_time -= dp->dp_read_overhead;

	dmu_tx_commit(tx);
+
+	data_written = dp->dp_space_towrite[txg & TXG_MASK];
+	dp->dp_space_towrite[txg & TXG_MASK] = 0;
+	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
+
+	/*
+	 * If the write limit max has not been explicitly set, set it
+	 * to a fraction of available physical memory (default 1/8th).
+	 * Note that we must inflate the limit because the spa
+	 * inflates write sizes to account for data replication.
+	 * Check this each sync phase to catch changing memory size.
+	 */
+	if (physmem != old_physmem && zfs_write_limit_shift) {
+		mutex_enter(&zfs_write_limit_lock);
+		old_physmem = physmem;
+		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
+		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
+		mutex_exit(&zfs_write_limit_lock);
+	}
+
+	/*
+	 * Attempt to keep the sync time consistent by adjusting the
+	 * amount of write traffic allowed into each transaction group.
+	 * Weight the throughput calculation towards the current value:
+	 * thru = 3/4 old_thru + 1/4 new_thru
+	 */
+	ASSERT(zfs_write_limit_min > 0);
+	if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
+		uint64_t throughput = (data_written * NANOSEC) / write_time;
+		if (dp->dp_throughput)
+			dp->dp_throughput = throughput / 4 +
+			    3 * dp->dp_throughput / 4;
+		else
+			dp->dp_throughput = throughput;
+		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
+		    MAX(zfs_write_limit_min,
+		    dp->dp_throughput * zfs_txg_synctime));
+	}
}

void
@@ -213,8 +386,8 @@ dsl_pool_zil_clean(dsl_pool_t *dp)
{
	dsl_dataset_t *ds;

-	while (ds = list_head(&dp->dp_synced_objsets)) {
-		list_remove(&dp->dp_synced_objsets, ds);
+	while (ds = list_head(&dp->dp_synced_datasets)) {
+		list_remove(&dp->dp_synced_datasets, ds);
		ASSERT(ds->ds_user_ptr != NULL);
		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
		dmu_buf_rele(ds->ds_dbuf, ds);
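The throttle above is an exponential moving average with weight 1/4 on the newest sample. A rough user-space rendering follows; units are bytes and nanoseconds, and the parameter names mirror but are not the actual tunables. Worked numbers: writing 96 MB in 0.8 s gives new_thru = 120 MB/s, and a previous estimate of 200 MB/s decays to 3/4*200 + 1/4*120 = 180 MB/s, so the next txg admits roughly 180 MB/s times zfs_txg_synctime bytes.

#include <stdint.h>

#define	NSEC_PER_SEC	1000000000ULL

/* sketch of the EWMA write-limit update; caller ensures write_time_ns > 0 */
static uint64_t
update_write_limit(uint64_t *thru, uint64_t data_written,
    uint64_t write_time_ns, uint64_t lim_min, uint64_t lim_inflated,
    uint64_t txg_synctime_s)
{
	uint64_t new_thru = data_written * NSEC_PER_SEC / write_time_ns;
	uint64_t limit;

	*thru = (*thru == 0) ? new_thru : new_thru / 4 + 3 * (*thru) / 4;
	limit = *thru * txg_synctime_s;
	if (limit < lim_min)
		limit = lim_min;
	if (limit > lim_inflated)
		limit = lim_inflated;
	return (limit);
}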
@@ -254,3 +427,187 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)

	return (space - resv);
}
+
+int
+dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+{
+	uint64_t reserved = 0;
+	uint64_t write_limit = (zfs_write_limit_override ?
+	    zfs_write_limit_override : dp->dp_write_limit);
+
+	if (zfs_no_write_throttle) {
+		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
+		    space);
+		return (0);
+	}
+
+	/*
+	 * Check to see if we have exceeded the maximum allowed IO for
+	 * this transaction group.  We can do this without locks since
+	 * a little slop here is ok.  Note that we do the reserved check
+	 * with only half the requested reserve: this is because the
+	 * reserve requests are worst-case, and we really don't want to
+	 * throttle based off of worst-case estimates.
+	 */
+	if (write_limit > 0) {
+		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] +
+		    dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
+
+		if (reserved && reserved > write_limit)
+			return (ERESTART);
+	}
+
+	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
+
+	/*
+	 * If this transaction group is over 7/8ths capacity, delay
+	 * the caller 1 clock tick.  This will slow down the "fill"
+	 * rate until the sync process can catch up with us.
+	 */
+	if (reserved && reserved > (write_limit - (write_limit >> 3)))
+		txg_delay(dp, tx->tx_txg, 1);
+
+	return (0);
+}
+
+void
+dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
+	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
+}
+
+void
+dsl_pool_memory_pressure(dsl_pool_t *dp)
+{
+	uint64_t space_inuse = 0;
+	int i;
+
+	if (dp->dp_write_limit == zfs_write_limit_min)
+		return;
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		space_inuse += dp->dp_space_towrite[i];
+		space_inuse += dp->dp_tempreserved[i];
+	}
+	dp->dp_write_limit = MAX(zfs_write_limit_min,
+	    MIN(dp->dp_write_limit, space_inuse / 4));
+}
+
+void
+dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+	if (space > 0) {
+		mutex_enter(&dp->dp_lock);
+		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
+		mutex_exit(&dp->dp_lock);
+	}
+}
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds, *prev = NULL;
+	int err;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+
+	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+			break;
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+		prev = NULL;
+	}
+
+	if (prev == NULL) {
+		prev = dp->dp_origin_snap;
+
+		/*
+		 * The $ORIGIN can't have any data, or the accounting
+		 * will be wrong.
+		 */
+		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
+
+		/* The origin doesn't get attached to itself */
+		if (ds->ds_object == prev->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			return (0);
+		}
+
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
+		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+
+		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+
+		dmu_buf_will_dirty(prev->ds_dbuf, tx);
+		prev->ds_phys->ds_num_children++;
+
+		if (ds->ds_phys->ds_next_snap_obj == 0) {
+			ASSERT(ds->ds_prev == NULL);
+			VERIFY(0 == dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+		}
+	}
+
+	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
+	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
+
+	if (prev->ds_phys->ds_next_clones_obj == 0) {
+		prev->ds_phys->ds_next_clones_obj =
+		    zap_create(dp->dp_meta_objset,
+		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+	}
+	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+
+	dsl_dataset_rele(ds, FTAG);
+	if (prev != dp->dp_origin_snap)
+		dsl_dataset_rele(prev, FTAG);
+	return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dp->dp_origin_snap != NULL);
+
+	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+	    tx, DS_FIND_CHILDREN);
+}
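upgrade_clones_cb() walks a dataset's snapshot chain backwards and stops where the previous snapshot no longer points forward at the dataset, which is exactly the clone's origin snapshot. A pointer-only sketch of that walk; the struct layout is invented for illustration:

#include <stddef.h>

struct snap {
	struct snap *prev;	/* models ds_prev_snap_obj */
	struct snap *next;	/* models ds_next_snap_obj */
};

/*
 * Walk back through our own snapshots; if the previous snapshot does
 * not point back at us, it belongs to the clone's origin.
 */
static struct snap *
find_origin(struct snap *ds)
{
	while (ds->prev != NULL) {
		if (ds->prev->next != ds)
			return (ds->prev);	/* the origin snapshot */
		ds = ds->prev;
	}
	return (NULL);	/* not a clone; gets attached to $ORIGIN instead */
}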
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	uint64_t dsobj;
+	dsl_dataset_t *ds;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dp->dp_origin_snap == NULL);
+
+	/* create the origin dir, ds, & snap-ds */
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+	    NULL, 0, kcred, tx);
+	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+	    dp, &dp->dp_origin_snap));
+	dsl_dataset_rele(ds, FTAG);
+	rw_exit(&dp->dp_config_rwlock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
index 2fff66d06b1e..212acbbc5968 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -44,14 +44,20 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
{
	zfs_prop_t prop;

-	if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
-	    zfs_prop_readonly(prop))
+	/*
+	 * The setonce properties are read-only, BUT they still
+	 * have a default value that can be used as the initial
+	 * value.
+	 */
+	if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
+	    (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
		return (ENOENT);

-	if (zfs_prop_get_type(prop) == prop_type_string) {
+	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
		if (intsz != 1)
			return (EOVERFLOW);
-		(void) strncpy(buf, zfs_prop_default_string(prop), numint);
+		(void) strncpy(buf, zfs_prop_default_string(prop),
+		    numint);
	} else {
		if (intsz != 8 || numint < 1)
			return (EOVERFLOW);
@@ -62,13 +68,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
	return (0);
}

-static int
-dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
+int
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
    int intsz, int numint, void *buf, char *setpoint)
{
	int err = ENOENT;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
	zfs_prop_t prop;

+	ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
	if (setpoint)
		setpoint[0] = '\0';

@@ -79,7 +88,6 @@
	 * ouside this loop.
	 */
	for (; dd != NULL; dd = dd->dd_parent) {
-		objset_t *mos = dd->dd_pool->dp_meta_objset;
		ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
		    propname, intsz, numint, buf);
@@ -92,8 +100,7 @@
		/*
		 * Break out of this loop for non-inheritable properties.
		 */
-		if (prop != ZFS_PROP_INVAL &&
-		    !zfs_prop_inheritable(prop))
+		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
			break;
	}
	if (err == ENOENT)
@@ -102,6 +109,26 @@
	return (err);
}

+int
+dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
+    int intsz, int numint, void *buf, char *setpoint)
+{
+	ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+
+	if (ds->ds_phys->ds_props_obj) {
+		int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+		    ds->ds_phys->ds_props_obj, propname, intsz, numint, buf);
+		if (err != ENOENT) {
+			if (setpoint)
+				dsl_dataset_name(ds, setpoint);
+			return (err);
+		}
+	}
+
+	return (dsl_prop_get_dd(ds->ds_dir, propname,
+	    intsz, numint, buf, setpoint));
+}
+
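Property lookup now has a fixed precedence: the snapshot's own props object first (dsl_prop_get_ds), then the dsl_dir chain from leaf to root, then the compiled-in default (dodefault). A simplified single-property model of that walk; the sentinel value -1 for "not set here" is an editorial choice:

#include <stdint.h>

struct dir {
	struct dir *parent;
	int64_t prop;		/* -1 means "not set at this level" */
};

static int64_t
prop_get(const struct dir *dd, int64_t defval, int inheritable)
{
	for (; dd != NULL; dd = dd->parent) {
		if (dd->prop != -1)
			return (dd->prop);	/* nearest setting wins */
		if (!inheritable)
			break;		/* only the local level counts */
	}
	return (defval);		/* models dodefault() */
}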
/*
 * Register interest in the named property.  We'll call the callback
 * once to notify it of the current property value, and again each time
@@ -114,18 +141,20 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg)
{
	dsl_dir_t *dd = ds->ds_dir;
+	dsl_pool_t *dp = dd->dd_pool;
	uint64_t value;
	dsl_prop_cb_record_t *cbr;
	int err;
	int need_rwlock;

-	need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
+	need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock);
	if (need_rwlock)
-		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+		rw_enter(&dp->dp_config_rwlock, RW_READER);

-	err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
+	err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL);
	if (err != 0) {
-		rw_exit(&dd->dd_pool->dp_config_rwlock);
+		if (need_rwlock)
+			rw_exit(&dp->dp_config_rwlock);
		return (err);
	}

@@ -141,46 +170,30 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,

	cbr->cbr_func(cbr->cbr_arg, value);

-	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+	VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
	    NULL, cbr, &dd));
	if (need_rwlock)
-		rw_exit(&dd->dd_pool->dp_config_rwlock);
-	/* Leave dataset open until this callback is unregistered */
+		rw_exit(&dp->dp_config_rwlock);
+	/* Leave dir open until this callback is unregistered */
	return (0);
}

int
-dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
-    int intsz, int numints, void *buf, char *setpoint)
-{
-	int err;
-
-	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-	err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
-	rw_exit(&dd->dd_pool->dp_config_rwlock);
-
-	return (err);
-}
-
-int
-dsl_prop_get(const char *ddname, const char *propname,
+dsl_prop_get(const char *dsname, const char *propname,
    int intsz, int numints, void *buf, char *setpoint)
{
-	dsl_dir_t *dd;
-	const char *tail;
+	dsl_dataset_t *ds;
	int err;

-	err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

-	if (tail && tail[0] != '@') {
-		dsl_dir_close(dd, FTAG);
-		return (ENOENT);
-	}
-	err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+	err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint);
+	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);

-	dsl_dir_close(dd, FTAG);
+	dsl_dataset_rele(ds, FTAG);
	return (err);
}

@@ -264,8 +277,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
	dsl_prop_cb_record_t *cbr;
	objset_t *mos = dp->dp_meta_objset;
	zap_cursor_t zc;
-	zap_attribute_t za;
+	zap_attribute_t *za;
	int err;
+	uint64_t dummyval;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
	err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
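A detail worth noticing in the next hunks: dsl_prop_changed_notify() recurses down the dir tree, so the zap_attribute_t cursor buffer moves from the stack to kmem_alloc(), answering the removed XXX comment warning that the recursion could blow the small, fixed-size kernel stack. The shape of the fix reduced to portable C; the tree type and buffer size here are illustrative only:

#include <stdlib.h>

struct dir_node {
	struct dir_node **children;
	int nchildren;
};

static void
notify_recursive(const struct dir_node *d)
{
	/* per-frame scratch lives on the heap, not the stack */
	char *scratch = malloc(512);

	/* ... examine this node's entries via scratch ... */
	for (int i = 0; i < d->nchildren; i++)
		notify_recursive(d->children[i]);
	free(scratch);
}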
@@ -278,7 +292,7 @@
	 * being inherited here or below; stop the recursion.
	 */
	err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-	    8, 1, &value);
+	    8, 1, &dummyval);
	if (err == 0) {
		dsl_dir_close(dd, FTAG);
		return;
@@ -287,22 +301,34 @@
	}

	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs);
-	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (strcmp(cbr->cbr_propname, propname) == 0) {
-			cbr->cbr_func(cbr->cbr_arg, value);
-		}
+	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
+	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+		uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+
+		if (strcmp(cbr->cbr_propname, propname) != 0)
+			continue;
+
+		/*
+		 * If the property is set on this ds, then it is not
+		 * inherited here; don't call the callback.
+		 */
+		if (propobj && 0 == zap_lookup(mos, propobj, propname,
+		    8, 1, &dummyval))
+			continue;
+
+		cbr->cbr_func(cbr->cbr_arg, value);
	}
	mutex_exit(&dd->dd_lock);

+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
	for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_child_dir_zapobj);
-	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_retrieve(&zc, za) == 0;
	    zap_cursor_advance(&zc)) {
-		/* XXX recursion could blow stack; esp. za! */
-		dsl_prop_changed_notify(dp, za.za_first_integer,
+		dsl_prop_changed_notify(dp, za->za_first_integer,
		    propname, value, FALSE);
	}
+	kmem_free(za, sizeof (zap_attribute_t));
	zap_cursor_fini(&zc);
	dsl_dir_close(dd, FTAG);
}
@@ -316,22 +342,37 @@ struct prop_set_arg {

static void
-dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
-	dsl_dir_t *dd = arg1;
+	dsl_dataset_t *ds = arg1;
	struct prop_set_arg *psa = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
-	uint64_t intval;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t zapobj, intval;
	int isint;
+	char valbuf[32];
+	char *valstr;

	isint = (dodefault(psa->name, 8, 1, &intval) == 0);

+	if (dsl_dataset_is_snapshot(ds)) {
+		ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+		    SPA_VERSION_SNAP_PROPS);
+		if (ds->ds_phys->ds_props_obj == 0) {
+			dmu_buf_will_dirty(ds->ds_dbuf, tx);
+			ds->ds_phys->ds_props_obj =
+			    zap_create(mos,
+			    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+		}
+		zapobj = ds->ds_phys->ds_props_obj;
+	} else {
+		zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+	}
+
	if (psa->numints == 0) {
		int err = zap_remove(mos, zapobj, psa->name, tx);
		ASSERT(err == 0 || err == ENOENT);
		if (isint) {
-			VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
+			VERIFY(0 == dsl_prop_get_ds(ds,
			    psa->name, 8, 1, &intval, NULL));
		}
	} else {
@@ -342,32 +383,63 @@
	}

	if (isint) {
-		dsl_prop_changed_notify(dd->dd_pool,
-		    dd->dd_object, psa->name, intval, TRUE);
+		if (dsl_dataset_is_snapshot(ds)) {
+			dsl_prop_cb_record_t *cbr;
+			/*
+			 * It's a snapshot; nothing can inherit this
+			 * property, so just look for callbacks on this
+			 * ds here.
+			 */
+			mutex_enter(&ds->ds_dir->dd_lock);
+			for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
+			    cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
+				if (cbr->cbr_ds == ds &&
+				    strcmp(cbr->cbr_propname, psa->name) == 0)
+					cbr->cbr_func(cbr->cbr_arg, intval);
+			}
+			mutex_exit(&ds->ds_dir->dd_lock);
+		} else {
+			dsl_prop_changed_notify(ds->ds_dir->dd_pool,
+			    ds->ds_dir->dd_object, psa->name, intval, TRUE);
+		}
+	}
+	if (isint) {
+		(void) snprintf(valbuf, sizeof (valbuf),
+		    "%lld", (longlong_t)intval);
+		valstr = valbuf;
+	} else {
+		valstr = (char *)psa->buf;
	}
+	spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
+	    LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
+	    "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
}

-int
-dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
-    int intsz, int numints, const void *buf)
+void
+dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+    cred_t *cr, dmu_tx_t *tx)
{
-	struct prop_set_arg psa;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;

-	psa.name = propname;
-	psa.intsz = intsz;
-	psa.numints = numints;
-	psa.buf = buf;
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx));
+
+	dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);

-	return (dsl_sync_task_do(dd->dd_pool,
-	    NULL, dsl_prop_set_sync, dd, &psa, 2));
+	spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+	    "%s=%llu dataset = %llu", name, (u_longlong_t)val,
+	    dd->dd_phys->dd_head_dataset_obj);
}

int
-dsl_prop_set(const char *ddname, const char *propname,
+dsl_prop_set(const char *dsname, const char *propname,
    int intsz, int numints, const void *buf)
{
-	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
	int err;
+	struct prop_set_arg psa;

	/*
	 * We must do these checks before we get to the syncfunc, since
@@ -378,11 +450,24 @@
	if (intsz * numints >= ZAP_MAXVALUELEN)
		return (E2BIG);

-	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);
-	err = dsl_prop_set_dd(dd, propname, intsz, numints, buf);
-	dsl_dir_close(dd, FTAG);
+
+	if (dsl_dataset_is_snapshot(ds) &&
+	    spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+		dsl_dataset_rele(ds, FTAG);
+		return (ENOTSUP);
+	}
+
+	psa.name = propname;
+	psa.intsz = intsz;
+	psa.numints = numints;
+	psa.buf = buf;
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	    NULL, dsl_prop_set_sync, ds, &psa, 2);
+
+	dsl_dataset_rele(ds, FTAG);
	return (err);
}
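The user-visible contract of dsl_prop_set() is that numints == 0 means "inherit": the local ZAP entry is removed and callbacks are re-fed the value that now shows through from an ancestor; anything else stores a local value. A one-slot model of set-versus-inherit, with the sentinel -1 again standing in for "no local value" (an editorial convention):

#include <stdint.h>

/* local == -1 means unset; inherited is what the parent chain supplies */
static int64_t
effective_value(int64_t local, int64_t inherited)
{
	return (local != -1 ? local : inherited);
}

static void
prop_set(int64_t *local, const int64_t *val)
{
	*local = (val == NULL) ? -1 : *val;	/* NULL models "zfs inherit" */
}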
@@ -390,45 +475,55 @@
/*
 * Iterate over all properties for this dataset and return them in an nvlist.
 */
int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local)
{
	dsl_dataset_t *ds = os->os->os_dsl_dataset;
	dsl_dir_t *dd = ds->ds_dir;
+	boolean_t snapshot = dsl_dataset_is_snapshot(ds);
	int err = 0;
-	dsl_pool_t *dp;
-	objset_t *mos;
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		return (0);
-	}
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t propobj = ds->ds_phys->ds_props_obj;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

-	dp = dd->dd_pool;
-	mos = dp->dp_meta_objset;
+	if (local && snapshot && !propobj)
+		return (0);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	for (; dd != NULL; dd = dd->dd_parent) {
+	while (dd != NULL) {
		char setpoint[MAXNAMELEN];
		zap_cursor_t zc;
		zap_attribute_t za;
+		dsl_dir_t *dd_next;
+
+		if (propobj) {
+			dsl_dataset_name(ds, setpoint);
+			dd_next = dd;
+		} else {
+			dsl_dir_name(dd, setpoint);
+			propobj = dd->dd_phys->dd_props_zapobj;
+			dd_next = dd->dd_parent;
+		}

-		dsl_dir_name(dd, setpoint);
-
-		for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
+		for (zap_cursor_init(&zc, mos, propobj);
		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
		    zap_cursor_advance(&zc)) {
			nvlist_t *propval;
-			zfs_prop_t prop;
-			/*
-			 * Skip non-inheritable properties.
-			 */
-			if ((prop = zfs_name_to_prop(za.za_name)) !=
-			    ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
-			    dd != ds->ds_dir)
+			zfs_prop_t prop = zfs_name_to_prop(za.za_name);
+
+			/* Skip non-inheritable properties. */
+			if (prop != ZPROP_INVAL &&
+			    !zfs_prop_inheritable(prop) &&
+			    (dd != ds->ds_dir || (snapshot && dd != dd_next)))
				continue;

+			/* Skip properties not valid for this type. */
+			if (snapshot && prop != ZPROP_INVAL &&
+			    !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+				continue;
+
+			/* Skip properties already defined */
			if (nvlist_lookup_nvlist(*nvp, za.za_name,
			    &propval) == 0)
				continue;
@@ -441,28 +536,26 @@
			 */
				char *tmp = kmem_alloc(za.za_num_integers,
				    KM_SLEEP);
-				err = zap_lookup(mos,
-				    dd->dd_phys->dd_props_zapobj,
-				    za.za_name, 1, za.za_num_integers,
-				    tmp);
+				err = zap_lookup(mos, propobj,
+				    za.za_name, 1, za.za_num_integers, tmp);
				if (err != 0) {
					kmem_free(tmp, za.za_num_integers);
					break;
				}
-				VERIFY(nvlist_add_string(propval,
-				    ZFS_PROP_VALUE, tmp) == 0);
+				VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+				    tmp) == 0);
				kmem_free(tmp, za.za_num_integers);
			} else {
				/*
				 * Integer property
				 */
				ASSERT(za.za_integer_length == 8);
-				(void) nvlist_add_uint64(propval,
-				    ZFS_PROP_VALUE, za.za_first_integer);
+				(void) nvlist_add_uint64(propval, ZPROP_VALUE,
+				    za.za_first_integer);
			}

-			VERIFY(nvlist_add_string(propval,
-			    ZFS_PROP_SOURCE, setpoint) == 0);
+			VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
+			    setpoint) == 0);
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			nvlist_free(propval);
@@ -472,6 +565,14 @@
		if (err != ENOENT)
			break;
		err = 0;
+		/*
+		 * If we are just after the props that have been set
+		 * locally, then we are done after the first iteration.
+		 */
+		if (local)
+			break;
+		dd = dd_next;
+		propobj = 0;
	}
	rw_exit(&dp->dp_config_rwlock);

	return (err);
@@ -484,7 +585,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0);
+	VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
	nvlist_free(propval);
}
@@ -495,7 +596,7 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0);
+	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
	nvlist_free(propval);
}
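dsl_prop_get_all() builds its nvlist by walking from the dataset outward and letting the nearest setting win; with the new local flag it stops after the first level. A sketch of that first-writer-wins merge over an array-of-levels model; the types here are invented for illustration:

#include <stddef.h>
#include <string.h>

struct prop { const char *name; long val; };

/* levels[0] is the dataset itself, then each ancestor in turn */
static size_t
merge_props(const struct prop *const levels[], const size_t counts[],
    size_t nlevels, int local_only, struct prop *out, size_t outmax)
{
	size_t n = 0;

	for (size_t l = 0; l < nlevels && n < outmax; l++) {
		for (size_t i = 0; i < counts[l] && n < outmax; i++) {
			size_t j;
			for (j = 0; j < n; j++)
				if (strcmp(out[j].name,
				    levels[l][i].name) == 0)
					break;
			if (j == n)	/* nearest setting wins */
				out[n++] = levels[l][i];
		}
		if (local_only)
			break;		/* models the 'local' flag */
	}
	return (n);
}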
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
new file mode 100644
index 000000000000..5f675b787df7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
@@ -0,0 +1,929 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+
+typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scrub_cb_t dsl_pool_scrub_clean_cb;
+static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+
+int zfs_scrub_min_time = 1;	/* scrub for at least 1 sec each txg */
+int zfs_resilver_min_time = 3;	/* resilver for at least 3 sec each txg */
+boolean_t zfs_no_scrub_io = B_FALSE;	/* set to disable scrub i/o */
+
+extern int zfs_txg_timeout;
+
+static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
+	NULL,
+	dsl_pool_scrub_clean_cb
+};
+
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
+{	\
+	(zb)->zb_objset = objset;	\
+	(zb)->zb_object = object;	\
+	(zb)->zb_level = level;	\
+	(zb)->zb_blkid = blkid;	\
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = arg1;
+	enum scrub_func *funcp = arg2;
+	dmu_object_type_t ot = 0;
+	boolean_t complete = B_FALSE;
+
+	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
+
+	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
+	ASSERT(*funcp > SCRUB_FUNC_NONE);
+	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
+
+	dp->dp_scrub_min_txg = 0;
+	dp->dp_scrub_max_txg = tx->tx_txg;
+
+	if (*funcp == SCRUB_FUNC_CLEAN) {
+		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
+
+		/* rewrite all disk labels */
+		vdev_config_dirty(rvd);
+
+		if (vdev_resilver_needed(rvd,
+		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
+			spa_event_notify(dp->dp_spa, NULL,
+			    ESC_ZFS_RESILVER_START);
+			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
+			    tx->tx_txg);
+		}
+
+		/* zero out the scrub stats in all vdev_stat_t's */
+		vdev_scrub_stat_update(rvd,
+		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
+		    POOL_SCRUB_EVERYTHING, B_FALSE);
+
+		dp->dp_spa->spa_scrub_started = B_TRUE;
+	}
+
+	/* back to the generic stuff */
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
+		ot = DMU_OT_ZAP_OTHER;
+
+	dp->dp_scrub_func = *funcp;
+	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
+	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
+	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+	dp->dp_scrub_restart = B_FALSE;
+	dp->dp_spa->spa_scrub_errors = 0;
+
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+	    &dp->dp_scrub_func, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_queue_obj, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_min_txg, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_max_txg, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+	    &dp->dp_scrub_bookmark, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+	    &dp->dp_spa->spa_scrub_errors, tx));
+
+	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
+	    "func=%u mintxg=%llu maxtxg=%llu",
+	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
+}
+
+int
+dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
+{
+	return (dsl_sync_task_do(dp, NULL,
+	    dsl_pool_scrub_setup_sync, dp, &func, 0));
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = arg1;
+	boolean_t *completep = arg2;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	mutex_enter(&dp->dp_scrub_cancel_lock);
+
+	if (dp->dp_scrub_restart) {
+		dp->dp_scrub_restart = B_FALSE;
+		*completep = B_FALSE;
+	}
+
+	/* XXX this is scrub-clean specific */
+	mutex_enter(&dp->dp_spa->spa_scrub_lock);
+	while (dp->dp_spa->spa_scrub_inflight > 0) {
+		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
+		    &dp->dp_spa->spa_scrub_lock);
+	}
+	mutex_exit(&dp->dp_spa->spa_scrub_lock);
+	dp->dp_spa->spa_scrub_started = B_FALSE;
+	dp->dp_spa->spa_scrub_active = B_FALSE;
+
+	dp->dp_scrub_func = SCRUB_FUNC_NONE;
+	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+	    dp->dp_scrub_queue_obj, tx));
+	dp->dp_scrub_queue_obj = 0;
+	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_QUEUE, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MIN_TXG, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MAX_TXG, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_ERRORS, tx));
+
+	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
+	    "complete=%u", *completep);
+
+	/* below is scrub-clean specific */
+	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
+	    *completep);
+	/*
+	 * If the scrub/resilver completed, update all DTLs to reflect this.
+	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
+	 */
+	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
+	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
+	if (dp->dp_scrub_min_txg && *completep)
+		spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
+	spa_errlog_rotate(dp->dp_spa);
+
+	/*
+	 * We may have finished replacing a device.
+	 * Let the async thread assess this and handle the detach.
+	 */
+	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
+
+	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
+	mutex_exit(&dp->dp_scrub_cancel_lock);
+}
+
+int
+dsl_pool_scrub_cancel(dsl_pool_t *dp)
+{
+	boolean_t complete = B_FALSE;
+
+	return (dsl_sync_task_do(dp, NULL,
+	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
+}
+
+int
+dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
+    zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+	/*
+	 * This function will be used by bp-rewrite wad to intercept frees.
+	 */
+	return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
+	    done, private, arc_flags));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+	    zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+	uint64_t zb1nextL0, zb2thisobj;
+
+	ASSERT(zb1->zb_objset == zb2->zb_objset);
+	ASSERT(zb1->zb_object != -1ULL);
+	ASSERT(zb2->zb_level == 0);
+
+	/*
+	 * A bookmark in the deadlist is considered to be after
+	 * everything else.
+	 */
+	if (zb2->zb_object == -1ULL)
+		return (B_TRUE);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	zb1nextL0 = (zb1->zb_blkid + 1) <<
+	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+	zb2thisobj = zb2->zb_object ? zb2->zb_object :
+	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+	if (zb1->zb_object == 0) {
+		uint64_t nextobj = zb1nextL0 *
+		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+		return (nextobj <= zb2thisobj);
+	}
+
+	if (zb1->zb_object < zb2thisobj)
+		return (B_TRUE);
+	if (zb1->zb_object > zb2thisobj)
+		return (B_FALSE);
+	if (zb2->zb_object == 0)
+		return (B_FALSE);
+	return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static boolean_t
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+{
+	int elapsed_ticks;
+	int mintime;
+
+	if (dp->dp_scrub_pausing)
+		return (B_TRUE); /* we're already pausing */
+
+	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
+		return (B_FALSE); /* we're resuming */
+
+	/* We only know how to resume from level-0 blocks. */
+	if (zb->zb_level != 0)
+		return (B_FALSE);
+
+	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
+	    zfs_scrub_min_time;
+	elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
+	if (elapsed_ticks > hz * zfs_txg_timeout ||
+	    (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
+		dprintf("pausing at %llx/%llx/%llx/%llx\n",
+		    (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+		dp->dp_scrub_pausing = B_TRUE;
+		dp->dp_scrub_bookmark = *zb;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+typedef struct zil_traverse_arg {
+	dsl_pool_t	*zta_dp;
+	zil_header_t	*zta_zh;
+} zil_traverse_arg_t;
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zil_traverse_arg_t *zta = arg;
+	dsl_pool_t *dp = zta->zta_dp;
+	zil_header_t *zh = zta->zta_zh;
+	zbookmark_t zb;
+
+	if (bp->blk_birth <= dp->dp_scrub_min_txg)
+		return;
+
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+		return;
+
+	zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+	zb.zb_object = 0;
+	zb.zb_level = -1;
+	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	if (lrc->lrc_txtype == TX_WRITE) {
+		zil_traverse_arg_t *zta = arg;
+		dsl_pool_t *dp = zta->zta_dp;
+		zil_header_t *zh = zta->zta_zh;
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_t zb;
+
+		if (bp->blk_birth <= dp->dp_scrub_min_txg)
+			return;
+
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return;
+
+		zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+		zb.zb_object = lr->lr_foid;
+		zb.zb_level = BP_GET_LEVEL(bp);
+		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+	}
+}
+
+static void
+traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zil_traverse_arg_t zta = { dp, zh };
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && (spa_mode & FWRITE))
+		return;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
+	    claim_txg);
+
+	zil_free(zilog);
+}
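scrub_pause() encodes the pacing policy: always yield once the txg timeout has elapsed, and yield earlier, after the scrub or resilver minimum, only if the sync thread is waiting. Its decision plus the bookmark resume test in plain C, with tick arithmetic simplified to milliseconds (an editorial sketch, not the kernel code):

#include <stdint.h>

/* yield policy of scrub_pause(), times in milliseconds */
static int
should_pause(uint64_t elapsed_ms, uint64_t min_ms, uint64_t timeout_ms,
    int sync_waiting)
{
	return (elapsed_ms > timeout_ms ||
	    (elapsed_ms > min_ms && sync_waiting));
}

/* resume test: skip blocks until we reach the recorded bookmark */
static int
already_visited(uint64_t blkid, uint64_t resume_blkid)
{
	return (resume_blkid != 0 && blkid < resume_blkid);
}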
+ */ + if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > dp->dp_scrub_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); + } + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + cbp = buf->b_data; + + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + scrub_visitbp(dp, dnp, buf, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *child_dnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + child_dnp = buf->b_data; + + for (i = 0; i < epb; i++, child_dnp++) { + for (j = 0; j < child_dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + child_dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, child_dnp, buf, + &child_dnp->dn_blkptr[j], &czb); + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + + osp = buf->b_data; + + traverse_zil(dp, &osp->os_zil_header); + + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + scrub_visitbp(dp, &osp->os_meta_dnode, buf, + &osp->os_meta_dnode.dn_blkptr[j], &czb); + } + } + + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : 0, 0, -1, 0); + scrub_visitbp(dp, NULL, NULL, bp, &zb); +} + +void +dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) != 0) { + return; + } + + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); +} + +void +dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + dp->dp_scrub_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_prev_snap_obj, tx) == 0); + } +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + dp = ds->ds_dir->dd_pool; + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + uint64_t min_txg_save; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + min_txg_save = dp->dp_scrub_min_txg; + dp->dp_scrub_min_txg = + MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); + scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); + dp->dp_scrub_min_txg = min_txg_save; + + if (dp->dp_scrub_pausing) + goto out; + + /* + * Add descendent datasets to work queue. 
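+ * The next snapshot (ds_next_snap_obj) always goes on the queue; + * clones of this snapshot are found either by scanning every dataset + * for this origin (pools older than SPA_VERSION_DSL_SCRUB) or + * directly from the ds_next_clones_obj list, as below. 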
+ */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } else { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + + dp = ds->ds_dir->dd_pool; + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + boolean_t complete = B_TRUE; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + /* If the spa is not fully loaded, don't bother. */ + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + return; + + if (dp->dp_scrub_restart) { + enum scrub_func func = dp->dp_scrub_func; + dp->dp_scrub_restart = B_FALSE; + dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); + } + + if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + /* + * We must have resumed after rebooting; reset the vdev + * stats to know that we're doing a scrub (although it + * will think we're just starting now). + */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + } + + dp->dp_scrub_pausing = B_FALSE; + dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); + dp->dp_spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_bookmark.zb_objset == 0) { + /* First do the MOS & ORIGIN */ + scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); + if (dp->dp_scrub_pausing) + goto out; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!dp->dp_scrub_pausing); + } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset will + * be -1, so we will skip this and find a new objset + * below. + */ + scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); + if (dp->dp_scrub_pausing) + goto out; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
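+ * From here the sync loop below pulls datasets off the scrub queue + * ZAP one at a time, removing each entry before visiting it, so a + * dataset is never scanned twice and a pause mid-dataset is recorded + * only in dp_scrub_bookmark. 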
+ */ + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + VERIFY(0 == zap_remove(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, za.za_name, tx)); + scrub_visitds(dp, za.za_first_integer, tx); + if (dp->dp_scrub_pausing) + break; + zap_cursor_fini(&zc); + } + zap_cursor_fini(&zc); + if (dp->dp_scrub_pausing) + goto out; + + /* done. */ + + dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); + return; +out: + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); +} + +void +dsl_pool_scrub_restart(dsl_pool_t *dp) +{ + mutex_enter(&dp->dp_scrub_cancel_lock); + dp->dp_scrub_restart = B_TRUE; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +/* + * scrub consumers + */ + +static void +dsl_pool_scrub_clean_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_pool_scrub_clean_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + size_t size = BP_GET_LSIZE(bp); + int d; + spa_t *spa = dp->dp_spa; + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_priority; + + if (dp->dp_scrub_isresilver == 0) { + /* It's a scrub */ + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else { + /* It's a resilver */ + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += + DVA_GET_ASIZE(&bp->blk_dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. 
+ */ + vd = spa->spa_root_vdev; + } + needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1); + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_pool_scrub_clean_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_pool_scrub_clean(dsl_pool_t *dp) +{ + /* + * Purge all vdev caches. We do this here rather than in sync + * context because this requires a writer lock on the spa_config + * lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); + dp->dp_spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(dp->dp_spa->spa_root_vdev); + dp->dp_spa->spa_scrub_reopen = B_FALSE; + spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + + return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c index 17deb569c4ab..21100225abf7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> +#include <sys/cred.h> #define DST_AVG_BLKSHIFT 14 @@ -49,6 +50,7 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; + dstg->dstg_cr = CRED(); return (dstg); } @@ -123,6 +125,16 @@ top: } void +dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + uint64_t txg; + + dstg->dstg_nowaiter = B_TRUE; + txg = dmu_tx_get_txg(tx); + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); +} + +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) { dsl_sync_task_t *dst; @@ -146,7 +158,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) * Check for sufficient space. 
*/ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); + dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); /* don't bother trying again */ if (dstg->dstg_err == ERESTART) dstg->dstg_err = EAGAIN; @@ -171,12 +183,16 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, + dstg->dstg_cr, tx); } } rw_exit(&dstg->dstg_pool->dp_config_rwlock); dsl_dir_tempreserve_clear(tr_cookie, tx); + + if (dstg->dstg_nowaiter) + dsl_sync_task_group_destroy(dstg); } int @@ -194,3 +210,16 @@ dsl_sync_task_do(dsl_pool_t *dp, dsl_sync_task_group_destroy(dstg); return (err); } + +void +dsl_sync_task_do_nowait(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) +{ + dsl_sync_task_group_t *dstg; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + dsl_sync_task_group_nowait(dstg, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 0dba134cef9b..22b56d617799 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa_impl.h> #include <sys/dmu.h> @@ -35,6 +33,7 @@ #include <sys/zio.h> uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * ========================================================================== @@ -341,7 +340,7 @@ metaslab_fini(metaslab_t *msp) int t; vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc); + -msp->ms_smo.smo_alloc, B_TRUE); metaslab_group_remove(mg, msp); @@ -534,8 +533,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); @@ -569,10 +568,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0); + vdev_space_update(vd, sm->sm_size, 0, B_TRUE); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -714,11 +713,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * Allocate a block for the specified i/o. 
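 * Allocation starts from the hint vdev's metaslab group when a hint * is given, otherwise from the class rotor, and skips vdevs that are * not writeable; METASLAB_GANG_HEADER in 'flags' marks the resulting * DVA as a gang header. 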
*/ static int -metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, - dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid) +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { metaslab_group_t *mg, *rotor; - metaslab_class_t *mc; vdev_t *vd; int dshift = 3; int all_zero; @@ -728,7 +726,11 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, ASSERT(!DVA_IS_VALID(&dva[d])); - mc = spa_metaslab_class_select(spa); + /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0) + return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. @@ -754,7 +756,7 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (hintdva_avoid) + if (flags & METASLAB_HINTBP_AVOID) mg = vd->vdev_mg->mg_next; else mg = vd->vdev_mg; @@ -764,12 +766,34 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, } else { mg = mc->mc_rotor; } - rotor = mg; + /* + * If the hint put us into the wrong class, just follow the rotor. + */ + if (mg->mg_class != mc) + mg = mc->mc_rotor; + + rotor = mg; top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* + * Don't allocate from faulted devices. + */ + if (!vdev_writeable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } + + ASSERT(mg->mg_class == mc); distance = vd->vdev_asize >> dshift; if (distance <= (1ULL << vd->vdev_ms_shift)) @@ -818,11 +842,12 @@ top: DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], 0); + DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); return (0); } +next: mc->mc_rotor = mg->mg_next; mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); @@ -879,38 +904,6 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); - - /* - * verify that this region is actually allocated in - * either a ms_allocmap or the ms_map - */ - if (msp->ms_map.sm_loaded) { - boolean_t allocd = B_FALSE; - int i; - - if (!space_map_contains(&msp->ms_map, offset, size)) { - allocd = B_TRUE; - } else { - for (i = 0; i < TXG_CONCURRENT_STATES; i++) { - space_map_t *sm = &msp->ms_allocmap - [(txg - i) & TXG_MASK]; - if (space_map_contains(sm, - offset, size)) { - allocd = B_TRUE; - break; - } - } - } - - if (!allocd) { - zfs_panic_recover("freeing free segment " - "(vdev=%llu offset=%llx size=%llx)", - (longlong_t)vdev, (longlong_t)offset, - (longlong_t)size); - } - } - - } mutex_exit(&msp->ms_lock); @@ -946,16 +939,18 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error) { + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_claim(&msp->ms_map, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + if (spa_mode & FWRITE) { /* don't dirty if 
we're zdb(1M) */ + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + } mutex_exit(&msp->ms_lock); @@ -963,32 +958,45 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) } int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, - uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid) +metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int d; int error = 0; + ASSERT(bp->blk_birth == 0); + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (ENOSPC); + } + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - for (d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, - txg, hintbp_avoid); + for (int d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + txg, flags); if (error) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); bzero(&dva[d], sizeof (dva_t)); } + spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); + spa_config_exit(spa, SCL_ALLOC, FTAG); + + bp->blk_birth = txg; + return (0); } @@ -997,12 +1005,16 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (d = 0; d < ndvas; d++) + for (int d = 0; d < ndvas; d++) metaslab_free_dva(spa, &dva[d], txg, now); + + spa_config_exit(spa, SCL_FREE, FTAG); } int @@ -1010,14 +1022,28 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d, error; - int last_error = 0; + int error = 0; ASSERT(!BP_IS_HOLE(bp)); - for (d = 0; d < ndvas; d++) + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below. + */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) - last_error = error; + break; + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); - return (last_error); + return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index a2f4614fed87..5fe4e638055a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,11 +60,13 @@ refcount_fini(void) void refcount_create(refcount_t *rc) { + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&rc->rc_list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link)); - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + rc->rc_count = 0; + rc->rc_removed_count = 0; } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c new file mode 100644 index 000000000000..db3b70fc68b0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads who have already obtained a read lock to + * re-enter another read lock (re-entrant read) - even if there are + * waiting writers. + * + * Callers who have not obtained a read lock give waiting writers priority. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller who has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list can represent a different rrwlock_t. This allows a thread + * to enter multiple and unique rrwlock_ts for read locks at the same time. + * + * Since using tsd exposes some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. 
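+ * + * A sketch of typical usage ('zfsvfs' here stands for the caller's + * filesystem handle; z_teardown_lock in zfs_vfsops.c is one consumer, + * with FTAG as the tag): + * + *	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + *	/* ... code that may re-enter the same lock for read ... */ + *	rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + * 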
Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader doesn't know if it is a re-entrant lock. But since it may be one, + * we allow the read to proceed (otherwise it could deadlock). Since once + * waiting writers are active, readers no longer bump the anonymous count, + * the anonymous readers will eventually flush themselves out. At this point, + * readers will be able to tell if they are a re-entrant lock (have a + * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then + * we must let them proceed. If they are not, then the reader blocks for the + * waiting writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +static void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + 
rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + if (rrn_find_and_remove(rrl)) { + if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + + } else { + if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + } + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount)); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c index ce5c26131af5..ca7076cb6fd9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,20 +30,20 @@ #include <sys/zio_checksum.h> /* - * SHA-256 checksum, as specified in FIPS 180-2, available at: - * http://csrc.nist.gov/cryptval + * SHA-256 checksum, as specified in FIPS 180-3, available at: + * http://csrc.nist.gov/publications/PubsFIPS.html * * This is a very compact implementation of SHA-256. * It is designed to be simple and portable, not to be fast. */ /* - * The literal definitions according to FIPS180-2 would be: + * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: * - * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) - * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * Ch(x, y, z) (x & y) ^ (~x & z) + * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) * - * We use logical equivalents which require one less op. + * We use equivalent logical reductions here that require one less op. 
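+ * + * For instance, in the Ch form below, a 1-bit in x flips the + * corresponding bit of z to y and a 0-bit leaves z alone, which is + * the same bitwise mux as (x & y) ^ (~x & z) without computing ~x; + * likewise the Maj form yields x where x and y agree and z where + * they differ, which is the majority bit. 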
*/ #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) @@ -105,20 +104,19 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; uint8_t pad[128]; - int padsize = size & 63; - int i; + int i, padsize; - for (i = 0; i < size - padsize; i += 64) + for (i = 0; i < (size & ~63ULL); i += 64) SHA256Transform(H, (uint8_t *)buf + i); - for (i = 0; i < padsize; i++) - pad[i] = ((uint8_t *)buf)[i]; + for (padsize = 0; i < size; i++) + pad[padsize++] = *((uint8_t *)buf + i); for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) pad[padsize] = 0; - for (i = 0; i < 8; i++) - pad[padsize++] = (size << 3) >> (56 - 8 * i); + for (i = 56; i >= 0; i -= 8) + pad[padsize++] = (size << 3) >> i; for (i = 0; i < padsize; i += 64) SHA256Transform(H, pad + i); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 6a7c525ae991..163b21572247 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a @@ -56,16 +54,388 @@ #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> #include <sys/callb.h> #include <sys/sunddi.h> +#include <sys/spa_boot.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" -int zio_taskq_threads = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); -TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, - &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); +int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE INTR */ + { 1, 1 }, /* ZIO_TYPE_NULL */ + { 1, 8 }, /* ZIO_TYPE_READ */ + { 8, 1 }, /* ZIO_TYPE_WRITE */ + { 1, 1 }, /* ZIO_TYPE_FREE */ + { 1, 1 }, /* ZIO_TYPE_CLAIM */ + { 1, 1 }, /* ZIO_TYPE_IOCTL */ +}; +static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. 
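+ * Each property is added as a nested nvlist keyed by the property + * name, carrying a ZPROP_SOURCE and a ZPROP_VALUE pair as built by + * spa_prop_add_list() above. 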
+ */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + uint64_t size = spa_get_space(spa); + uint64_t used = spa_get_alloc(spa); + uint64_t cap, version; + zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + /* + * readonly properties + */ + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + /* + * settable properties that are not stored in the pool property object. + */ + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. + */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + zap_cursor_t zc; + zap_attribute_t za; + objset_t *mos = spa->spa_meta_objset; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + /* + * Get properties from the MOS pool property object. 
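+ * Entries with an 8-byte integer width are numeric properties + * (ZPOOL_PROP_BOOTFS is special-cased to translate the stored object + * number into a dataset name); entries with a 1-byte width are + * strings fetched back with zap_lookup(). 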
+ */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds)) { + rw_exit(&dp->dp_config_rwlock); + break; + } + + strval = kmem_alloc( + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); +out: + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. + */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_b |