aboutsummaryrefslogtreecommitdiff
path: root/sys/cddl/contrib/opensolaris
diff options
context:
space:
mode:
authorPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerPawel Jakub Dawidek <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commit1ba4a712dde6e6c613fc411a96958b4ade67de4c (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris
parent8fc061164d74a4c9775f39da3c0b5d02112866c8 (diff)
downloadsrc-1ba4a712dde6e6c613fc411a96958b4ade67de4c.tar.gz
src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.zip
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Notes
Notes: svn path=/head/; revision=185029
Diffstat (limited to 'sys/cddl/contrib/opensolaris')
-rw-r--r--sys/cddl/contrib/opensolaris/common/acl/acl_common.c1598
-rw-r--r--sys/cddl/contrib/opensolaris/common/acl/acl_common.h27
-rw-r--r--sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S12
-rw-r--r--sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S49
-rw-r--r--sys/cddl/contrib/opensolaris/common/avl/avl.c66
-rw-r--r--sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c306
-rw-r--r--sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c2130
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c65
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h44
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c234
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h81
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c84
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h7
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c800
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h91
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c186
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c406
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/Makefile.files15
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c397
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c74
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c2295
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c53
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c405
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c285
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c597
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c841
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c36
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c248
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c23
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c403
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c132
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c2413
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c735
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c496
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c395
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c313
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c929
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c35
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c156
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c249
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c32
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c3038
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c232
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c121
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c776
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c27
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h108
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h41
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h54
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h79
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h131
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h45
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h112
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h52
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h76
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h40
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h160
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h125
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h72
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h162
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h144
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h20
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h216
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h172
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h12
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c17
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c1206
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c105
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c287
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c73
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c445
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c547
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c155
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c84
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c148
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c180
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c178
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c421
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c2499
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c102
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c423
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c326
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c120
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c716
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c2294
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c456
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c579
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c796
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c1762
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c1101
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c377
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c2470
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c96
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c63
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c715
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/callb.c31
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/list.c84
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/os/taskq.c21
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr.c58
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr.h24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/rpc/xdr_array.c15
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/acl.h300
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h61
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/avl.h25
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/byteorder.h49
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/callb.h11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/cred.h34
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/dkio.h26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/dklabel.h33
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h77
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h13
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h37
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h454
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h37
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h93
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/list.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h31
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/processor.h14
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/synch.h43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h247
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h108
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h91
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h35376
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h395
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c12
161 files changed, 70855 insertions, 10363 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
index 2f32e7a8a78a..e6b678079635 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,100 +19,245 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
-#include <sys/acl.h>
#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
#if defined(_KERNEL)
+#include <sys/kmem.h>
#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
#include <sys/debug.h>
#else
#include <errno.h>
#include <stdlib.h>
+#include <stddef.h>
#include <strings.h>
+#include <unistd.h>
#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
#define ASSERT assert
#endif
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+ uid_t key;
+ avl_node_t avl;
+ uint32_t mask;
+ uint32_t allowed;
+ uint32_t denied;
+ int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers;
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups;
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag;
+ ace_to_aent_state_t state;
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
ace_t trivial_acl[] = {
- {-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
+ {(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
+ {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
- {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE},
- {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
+ {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
+ ACE_ACCESS_DENIED_ACE_TYPE},
+ {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
+ ACE_ACCESS_ALLOWED_ACE_TYPE},
+ {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
- {-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ {(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
};
void
-adjust_ace_pair(ace_t *pair, mode_t mode)
+adjust_ace_pair_common(void *pair, size_t access_off,
+ size_t pairsize, mode_t mode)
{
+ char *datap = (char *)pair;
+ uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
+ uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
+ access_off);
if (mode & S_IROTH)
- pair[1].a_access_mask |= ACE_READ_DATA;
+ *amask1 |= ACE_READ_DATA;
else
- pair[0].a_access_mask |= ACE_READ_DATA;
+ *amask0 |= ACE_READ_DATA;
if (mode & S_IWOTH)
- pair[1].a_access_mask |=
- ACE_WRITE_DATA|ACE_APPEND_DATA;
+ *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
else
- pair[0].a_access_mask |=
- ACE_WRITE_DATA|ACE_APPEND_DATA;
+ *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
if (mode & S_IXOTH)
- pair[1].a_access_mask |= ACE_EXECUTE;
+ *amask1 |= ACE_EXECUTE;
else
- pair[0].a_access_mask |= ACE_EXECUTE;
+ *amask0 |= ACE_EXECUTE;
+}
+
+void
+adjust_ace_pair(ace_t *pair, mode_t mode)
+{
+ adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask),
+ sizeof (ace_t), mode);
+}
+
+static void
+ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny)
+{
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE)
+ *allow = B_TRUE;
+ else if (type == ACE_ACCESS_DENIED_ACE_TYPE)
+ *deny = B_TRUE;
}
/*
* ace_trivial:
* determine whether an ace_t acl is trivial
*
- * Trivialness implys that the acl is composed of only
+ * Trivialness implies that the acl is composed of only
* owner, group, everyone entries. ACL can't
* have read_acl denied, and write_owner/write_acl/write_attributes
* can only be owner@ entry.
*/
int
-ace_trivial(ace_t *acep, int aclcnt)
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
{
- int i;
- int owner_seen = 0;
- int group_seen = 0;
- int everyone_seen = 0;
+ boolean_t owner_allow = B_FALSE;
+ boolean_t group_allow = B_FALSE;
+ boolean_t everyone_allow = B_FALSE;
+ boolean_t owner_deny = B_FALSE;
+ boolean_t group_deny = B_FALSE;
+ boolean_t everyone_deny = B_FALSE;
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
- for (i = 0; i != aclcnt; i++) {
- switch (acep[i].a_flags & 0xf040) {
+ while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
+ switch (flags & ACE_TYPE_FLAGS) {
case ACE_OWNER:
- if (group_seen || everyone_seen)
+ if (group_allow || group_deny || everyone_allow ||
+ everyone_deny)
return (1);
- owner_seen++;
+ ace_allow_deny_helper(type, &owner_allow, &owner_deny);
break;
case ACE_GROUP|ACE_IDENTIFIER_GROUP:
- if (everyone_seen || owner_seen == 0)
+ if (everyone_allow || everyone_deny &&
+ (!owner_allow && !owner_deny))
return (1);
- group_seen++;
+ ace_allow_deny_helper(type, &group_allow, &group_deny);
break;
case ACE_EVERYONE:
- if (owner_seen == 0 || group_seen == 0)
+ if (!owner_allow && !owner_deny &&
+ !group_allow && !group_deny)
return (1);
- everyone_seen++;
+ ace_allow_deny_helper(type,
+ &everyone_allow, &everyone_deny);
break;
default:
return (1);
}
- if (acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+ if (flags & (ACE_FILE_INHERIT_ACE|
ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
ACE_INHERIT_ONLY_ACE))
return (1);
@@ -124,27 +268,49 @@ ace_trivial(ace_t *acep, int aclcnt)
* Don't allow anybody to deny reading basic
* attributes or a files ACL.
*/
- if ((acep[i].a_access_mask &
- (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- (acep[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE))
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
return (1);
/*
* Allow on owner@ to allow
* write_acl/write_owner/write_attributes
*/
- if (acep[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
- (!(acep[i].a_flags & ACE_OWNER) && (acep[i].a_access_mask &
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
(ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES))))
return (1);
+
}
- if ((owner_seen == 0) || (group_seen == 0) || (everyone_seen == 0))
- return (1);
+ if (!owner_allow || !owner_deny || !group_allow || !group_deny ||
+ !everyone_allow || !everyone_deny)
+ return (1);
return (0);
}
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+ uint16_t *type, uint32_t *mask)
+{
+ ace_t *acep = datap;
+
+ if (cookie >= aclcnt)
+ return (0);
+
+ *flags = acep[cookie].a_flags;
+ *type = acep[cookie].a_type;
+ *mask = acep[cookie++].a_access_mask;
+
+ return (cookie);
+}
+
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
/*
* Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
@@ -171,8 +337,8 @@ ksort(caddr_t v, int n, int s, int (*f)())
for (g = n / 2; g > 0; g /= 2) {
for (i = g; i < n; i++) {
for (j = i - g; j >= 0 &&
- (*f)(v + j * s, v + (j + g) * s) == 1;
- j -= g) {
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
p1 = (void *)(v + j * s);
p2 = (void *)(v + (j + g) * s);
for (ii = 0; ii < s / 4; ii++) {
@@ -215,3 +381,1347 @@ cmp2acls(void *a, void *b)
/* Totally equal */
return (0);
}
+
+/*ARGSUSED*/
+static void *
+cacl_realloc(void *ptr, size_t size, size_t new_size)
+{
+#if defined(_KERNEL)
+ void *tmp;
+
+ tmp = kmem_alloc(new_size, KM_SLEEP);
+ (void) memcpy(tmp, ptr, (size < new_size) ? size : new_size);
+ kmem_free(ptr, size);
+ return (tmp);
+#else
+ return (realloc(ptr, new_size));
+#endif
+}
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+#if defined(_KERNEL)
+ *ptr = kmem_zalloc(size, KM_SLEEP);
+ return (0);
+#else
+ *ptr = calloc(1, size);
+ if (*ptr == NULL)
+ return (errno);
+
+ return (0);
+#endif
+}
+
+/*ARGSUSED*/
+static void
+cacl_free(void *ptr, size_t size)
+{
+#if defined(_KERNEL)
+ kmem_free(ptr, size);
+#else
+ free(ptr);
+#endif
+}
+
+acl_t *
+acl_alloc(enum acl_type type)
+{
+ acl_t *aclp;
+
+ if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+ return (NULL);
+
+ aclp->acl_aclp = NULL;
+ aclp->acl_cnt = 0;
+
+ switch (type) {
+ case ACE_T:
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ break;
+ case ACLENT_T:
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ break;
+ default:
+ acl_free(aclp);
+ aclp = NULL;
+ }
+ return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+ int acl_size;
+
+ if (aclp == NULL)
+ return;
+
+ if (aclp->acl_aclp) {
+ acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+ cacl_free(aclp->acl_aclp, acl_size);
+ }
+
+ cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given an mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow)
+{
+ uint32_t access = 0;
+ int haswriteperm = 0;
+ int hasreadperm = 0;
+
+ if (isallow) {
+ haswriteperm = (mode & S_IWOTH);
+ hasreadperm = (mode & S_IROTH);
+ } else {
+ haswriteperm = !(mode & S_IWOTH);
+ hasreadperm = !(mode & S_IROTH);
+ }
+
+ /*
+ * The following call takes care of correctly setting the following
+ * mask bits in the access_mask:
+ * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+ */
+ access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+ if (isallow) {
+ access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+ if (isowner)
+ access |= ACE_WRITE_ACL;
+ } else {
+ if (! isowner)
+ access |= ACE_WRITE_ACL;
+ }
+
+ /* read */
+ if (mode & S_IROTH) {
+ access |= ACE_READ_DATA;
+ }
+ /* write */
+ if (mode & S_IWOTH) {
+ access |= ACE_WRITE_DATA |
+ ACE_APPEND_DATA;
+ if (isdir)
+ access |= ACE_DELETE_CHILD;
+ }
+ /* exec */
+ if (mode & 01) {
+ access |= ACE_EXECUTE;
+ }
+
+ return (access);
+}
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+ (void) memcpy(deny, allow, sizeof (ace_t));
+
+ deny->a_who = allow->a_who;
+
+ deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+ deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+ ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_WRITE_NAMED_ATTRS);
+ deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+ ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+ B_FALSE);
+}
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+ int *hasmask, mode_t *mask,
+ int *numuser, int *numgroup, int *needsort)
+{
+ int error = 0;
+ int i;
+ int curtype = 0;
+
+ *hasmask = 0;
+ *mask = 07;
+ *needsort = 0;
+ *numuser = 0;
+ *numgroup = 0;
+
+ for (i = 0; i < n; i++) {
+ if (aclent[i].a_type < curtype)
+ *needsort = 1;
+ else if (aclent[i].a_type > curtype)
+ curtype = aclent[i].a_type;
+ if (aclent[i].a_type & USER)
+ (*numuser)++;
+ if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+ (*numgroup)++;
+ if (aclent[i].a_type & CLASS_OBJ) {
+ if (*hasmask) {
+ error = EINVAL;
+ goto out;
+ } else {
+ *hasmask = 1;
+ *mask = aclent[i].a_perm;
+ }
+ }
+ }
+
+ if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+ int error = 0;
+ mode_t mask;
+ int numuser, numgroup, needsort;
+ int resultsize = 0;
+ int i, groupi = 0, skip;
+ ace_t *acep, *result = NULL;
+ int hasmask;
+
+ error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+ &numuser, &numgroup, &needsort);
+ if (error != 0)
+ goto out;
+
+ /* allow + deny for each aclent */
+ resultsize = n * 2;
+ if (hasmask) {
+ /*
+ * stick extra deny on the group_obj and on each
+ * user|group for the mask (the group_obj was added
+ * into the count for numgroup)
+ */
+ resultsize += numuser + numgroup;
+ /* ... and don't count the mask itself */
+ resultsize -= 2;
+ }
+
+ /* sort the source if necessary */
+ if (needsort)
+ ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+ if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
+ goto out;
+
+ acep = result;
+
+ for (i = 0; i < n; i++) {
+ /*
+ * don't process CLASS_OBJ (mask); mask was grabbed in
+ * ln_aent_preprocess()
+ */
+ if (aclent[i].a_type & CLASS_OBJ)
+ continue;
+
+ /* If we need an ACL_MASK emulator, prepend it now */
+ if ((hasmask) &&
+ (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+ acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ acep->a_flags = 0;
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |=
+ (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ }
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+ /*
+ * Set the access mask for the prepended deny
+ * ace. To do this, we invert the mask (found
+ * in ln_aent_preprocess()) then convert it to an
+ * DENY ace access_mask.
+ */
+ acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+ isdir, 0, 0);
+ acep += 1;
+ }
+
+ /* handle a_perm -> access_mask */
+ acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+ isdir, aclent[i].a_type & USER_OBJ, 1);
+
+ /* emulate a default aclent */
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+
+ /*
+ * handle a_perm and a_id
+ *
+ * this must be done last, since it involves the
+ * corresponding deny aces, which are handled
+ * differently for each different a_type.
+ */
+ if (aclent[i].a_type & USER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_OWNER;
+ ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+ acep += 2;
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_GROUP;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ }
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ /*
+ * Set the corresponding deny for the group ace.
+ *
+ * The deny aces go after all of the groups, unlike
+ * everything else, where they immediately follow
+ * the allow ace.
+ *
+ * We calculate "skip", the number of slots to
+ * skip ahead for the deny ace, here.
+ *
+ * The pattern is:
+ * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+ * thus, skip is
+ * (2 * numgroup) - 1 - groupi
+ * (2 * numgroup) to account for MD + A
+ * - 1 to account for the fact that we're on the
+ * access (A), not the mask (MD)
+ * - groupi to account for the fact that we have
+ * passed up groupi number of MD's.
+ */
+ skip = (2 * numgroup) - 1 - groupi;
+ ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+ /*
+ * If we just did the last group, skip acep past
+ * all of the denies; else, just move ahead one.
+ */
+ if (++groupi >= numgroup)
+ acep += numgroup + 1;
+ else
+ acep += 1;
+ } else if (aclent[i].a_type & OTHER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_EVERYONE;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ *acepp = result;
+ *rescount = resultsize;
+
+out:
+ if (error != 0) {
+ if ((result != NULL) && (resultsize > 0)) {
+ cacl_free(result, resultsize * sizeof (ace_t));
+ }
+ }
+
+ return (error);
+}
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir,
+ ace_t **retacep, int *retacecnt)
+{
+ ace_t *acep;
+ ace_t *dfacep;
+ int acecnt = 0;
+ int dfacecnt = 0;
+ int dfaclstart = 0;
+ int dfaclcnt = 0;
+ aclent_t *aclp;
+ int i;
+ int error;
+ int acesz, dfacesz;
+
+ ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+ for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ if (i < aclcnt) {
+ dfaclstart = i;
+ dfaclcnt = aclcnt - i;
+ }
+
+ if (dfaclcnt && isdir == 0) {
+ return (EINVAL);
+ }
+
+ error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+ if (error)
+ return (error);
+
+ if (dfaclcnt) {
+ error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+ &dfacep, &dfacecnt, isdir);
+ if (error) {
+ if (acep) {
+ cacl_free(acep, acecnt * sizeof (ace_t));
+ }
+ return (error);
+ }
+ }
+
+ if (dfacecnt != 0) {
+ acesz = sizeof (ace_t) * acecnt;
+ dfacesz = sizeof (ace_t) * dfacecnt;
+ acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+ if (acep == NULL)
+ return (ENOMEM);
+ if (dfaclcnt) {
+ (void) memcpy(acep + acecnt, dfacep, dfacesz);
+ }
+ }
+ if (dfaclcnt)
+ cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+ *retacecnt = acecnt + dfacecnt;
+ *retacep = acep;
+ return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, int isdir)
+{
+ int error = 0;
+ o_mode_t mode = 0;
+ uint32_t bits, wantbits;
+
+ /* read */
+ if (mask & ACE_READ_DATA)
+ mode |= S_IROTH;
+
+ /* write */
+ wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+ if (isdir)
+ wantbits |= ACE_DELETE_CHILD;
+ bits = mask & wantbits;
+ if (bits != 0) {
+ if (bits != wantbits) {
+ error = ENOTSUP;
+ goto out;
+ }
+ mode |= S_IWOTH;
+ }
+
+ /* exec */
+ if (mask & ACE_EXECUTE) {
+ mode |= S_IXOTH;
+ }
+
+ *modep = mode;
+
+out:
+ return (error);
+}
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+ acevals_t key, *rc;
+ avl_index_t where;
+
+ key.key = ace->a_who;
+ rc = avl_find(avl, &key, &where);
+ if (rc != NULL)
+ return (rc);
+
+ /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+ if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+ return (NULL);
+
+ acevals_init(rc, ace->a_who);
+ avl_insert(avl, rc, where);
+ (*num)++;
+
+ return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+ int set_deny, err_deny;
+ int set_allow, err_allow;
+ int acl_consume;
+ int haswriteperm, hasreadperm;
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+ } else {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+ }
+
+ acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+ ACL_DELETE_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY |
+ ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+ ACL_WRITE_NAMED_WRITER_ERR_DENY |
+ ACL_READ_NAMED_READER_ERR_DENY);
+
+ if (mask_bit == ACE_SYNCHRONIZE) {
+ set_deny = ACL_SYNCHRONIZE_SET_DENY;
+ err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+ set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+ err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_OWNER) {
+ set_deny = ACL_WRITE_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+ } else if (mask_bit == ACE_DELETE) {
+ set_deny = ACL_DELETE_SET_DENY;
+ err_deny = ACL_DELETE_ERR_DENY;
+ set_allow = ACL_DELETE_SET_ALLOW;
+ err_allow = ACL_DELETE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+ if (isowner) {
+ set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+ } else if (haswriteperm) {
+ set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+ } else {
+ if ((acep->a_access_mask & mask_bit) &&
+ (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+ if (!hasreadperm)
+ return (0);
+
+ set_deny = ACL_READ_NAMED_READER_SET_DENY;
+ err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+ set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+ err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+ if (!haswriteperm)
+ return (0);
+
+ set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+ } else {
+ return (EINVAL);
+ }
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ if (acl_consume & set_deny) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_deny) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ } else {
+ /* ACE_ACCESS_ALLOWED_ACE_TYPE */
+ if (acl_consume & set_allow) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_allow) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+ int error = 0;
+ int isowner;
+
+ /* only ALLOW or DENY */
+ if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid flags */
+ if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* some flags are illegal */
+ if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+ ACE_FAILED_ACCESS_ACE_FLAG |
+ ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid masks */
+ if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ isowner = 1;
+ } else {
+ isowner = 0;
+ }
+
+ error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_DELETE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ /* more detailed checking of masks */
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+ (acep->a_access_mask & ACE_APPEND_DATA)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+ /* ACL enforcement */
+ if ((acep->a_access_mask & ACE_READ_ACL) &&
+ (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (acep->a_access_mask & ACE_WRITE_ACL) {
+ if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+ (isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (! isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+out:
+ return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir)
+{
+ /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+ (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+ return (ENOTSUP);
+ }
+
+ return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+ uid_t owner, gid_t group, int isdir)
+{
+ int error;
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ if (vals->allowed != (vals->denied ^ flips)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+ (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ dest->a_type = vals->aent_type;
+ if (dest->a_type & (USER | GROUP)) {
+ dest->a_id = vals->key;
+ } else if (dest->a_type & USER_OBJ) {
+ dest->a_id = owner;
+ } else if (dest->a_type & GROUP_OBJ) {
+ dest->a_id = group;
+ } else if (dest->a_type & OTHER_OBJ) {
+ dest->a_id = 0;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+ uid_t owner, gid_t group, int isdir)
+{
+ int error = 0;
+ aclent_t *aent, *result = NULL;
+ acevals_t *vals;
+ int resultcount;
+
+ if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+ (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ resultcount = 3 + list->numusers + list->numgroups;
+ /*
+ * This must be the same condition as below, when we add the CLASS_OBJ
+ * (aka ACL mask)
+ */
+ if ((list->hasmask) || (! list->dfacl_flag))
+ resultcount += 1;
+
+ if (cacl_malloc((void **)&result,
+ resultcount * sizeof (aclent_t)) != 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ aent = result;
+
+ /* USER_OBJ */
+ if (!(list->user_obj.aent_type & USER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+ isdir);
+
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* USER */
+ vals = NULL;
+ for (vals = avl_first(&list->user); vals != NULL;
+ vals = AVL_NEXT(&list->user, vals)) {
+ if (!(vals->aent_type & USER)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /* GROUP_OBJ */
+ if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* GROUP */
+ vals = NULL;
+ for (vals = avl_first(&list->group); vals != NULL;
+ vals = AVL_NEXT(&list->group, vals)) {
+ if (!(vals->aent_type & GROUP)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /*
+ * CLASS_OBJ (aka ACL_MASK)
+ *
+ * An ACL_MASK is not fabricated if the ACL is a default ACL.
+ * This is to follow UFS's behavior.
+ */
+ if ((list->hasmask) || (! list->dfacl_flag)) {
+ if (list->hasmask) {
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ error = ace_mask_to_mode(list->acl_mask ^ flips,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ } else {
+ /* fabricate the ACL_MASK from the group permissions */
+ error = ace_mask_to_mode(list->group_obj.allowed,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ }
+ aent->a_id = 0;
+ aent->a_type = CLASS_OBJ | list->dfacl_flag;
+ ++aent;
+ }
+ /* OTHER_OBJ */
+ if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+
+ *aclentp = result;
+ *aclcnt = resultcount;
+
+out:
+ if (error != 0) {
+ if (result != NULL)
+ cacl_free(result, resultcount * sizeof (aclent_t));
+ }
+
+ return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+ acevals_t *node;
+ void *cookie;
+
+ if (al == NULL)
+ return;
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+
+ avl_destroy(&al->user);
+ avl_destroy(&al->group);
+
+ /* free the container itself */
+ cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+ const acevals_t *a = va, *b = vb;
+
+ if (a->key == b->key)
+ return (0);
+
+ if (a->key > b->key)
+ return (1);
+
+ else
+ return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+ aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+ int isdir)
+{
+ int error = 0;
+ ace_t *acep;
+ uint32_t bits;
+ int i;
+ ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+ acevals_t *vals;
+
+ *aclentp = NULL;
+ *aclcnt = 0;
+ *dfaclentp = NULL;
+ *dfaclcnt = 0;
+
+ /* we need at least user_obj, group_obj, and other_obj */
+ if (n < 6) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (ace == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+
+ ace_list_init(normacl, 0);
+
+ error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ ace_list_init(dfacl, ACL_DEFAULT);
+
+ /* process every ace_t... */
+ for (i = 0; i < n; i++) {
+ acep = &ace[i];
+
+ /* rule out certain cases quickly */
+ error = ace_to_aent_legal(acep);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Turn off these bits in order to not have to worry about
+ * them when doing the checks for compliments.
+ */
+ acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+ ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+ ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+ /* see if this should be a regular or default acl */
+ bits = acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE);
+ if (bits != 0) {
+ /* all or nothing on these inherit bits */
+ if (bits != (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl = dfacl;
+ } else {
+ acl = normacl;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ if (acl->state > ace_user_obj) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user_obj;
+ acl->seen |= USER_OBJ;
+ vals = &acl->user_obj;
+ vals->aent_type = USER_OBJ | acl->dfacl_flag;
+ } else if ((acep->a_flags & ACE_EVERYONE)) {
+ acl->state = ace_other_obj;
+ acl->seen |= OTHER_OBJ;
+ vals = &acl->other_obj;
+ vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+ } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+ if (acl->state > ace_group) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_flags & ACE_GROUP)) {
+ acl->seen |= GROUP_OBJ;
+ vals = &acl->group_obj;
+ vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+ } else {
+ acl->seen |= GROUP;
+ vals = acevals_find(acep, &acl->group,
+ &acl->numgroups);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = GROUP | acl->dfacl_flag;
+ }
+ acl->state = ace_group;
+ } else {
+ if (acl->state > ace_user) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user;
+ acl->seen |= USER;
+ vals = acevals_find(acep, &acl->user,
+ &acl->numusers);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = USER | acl->dfacl_flag;
+ }
+
+ if (!(acl->state > ace_unused)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ /* no more than one allowed per aclent_t */
+ if (vals->allowed != ACE_MASK_UNDEFINED) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->allowed = acep->a_access_mask;
+ } else {
+ /*
+ * it's a DENY; if there was a previous DENY, it
+ * must have been an ACL_MASK.
+ */
+ if (vals->denied != ACE_MASK_UNDEFINED) {
+ /* ACL_MASK is for USER and GROUP only */
+ if ((acl->state != ace_user) &&
+ (acl->state != ace_group)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if (! acl->hasmask) {
+ acl->hasmask = 1;
+ acl->acl_mask = vals->denied;
+ /* check for mismatched ACL_MASK emulations */
+ } else if (acl->acl_mask != vals->denied) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->mask = vals->denied;
+ }
+ vals->denied = acep->a_access_mask;
+ }
+ }
+
+ /* done collating; produce the aclent_t lists */
+ if (normacl->state != ace_unused) {
+ error = ace_list_to_aent(normacl, aclentp, aclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+ if (dfacl->state != ace_unused) {
+ error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+out:
+ if (normacl != NULL)
+ ace_list_free(normacl);
+ if (dfacl != NULL)
+ ace_list_free(dfacl);
+
+ return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir,
+ uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+ int error = 0;
+ aclent_t *aclentp, *dfaclentp;
+ int aclcnt, dfaclcnt;
+ int aclsz, dfaclsz;
+
+ error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+ &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+ if (error)
+ return (error);
+
+
+ if (dfaclcnt != 0) {
+ /*
+ * Slap aclentp and dfaclentp into a single array.
+ */
+ aclsz = sizeof (aclent_t) * aclcnt;
+ dfaclsz = sizeof (aclent_t) * dfaclcnt;
+ aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+ if (aclentp != NULL) {
+ (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+ } else {
+ error = ENOMEM;
+ }
+ }
+
+ if (aclentp) {
+ *retaclentp = aclentp;
+ *retaclcnt = aclcnt + dfaclcnt;
+ }
+
+ if (dfaclentp)
+ cacl_free(dfaclentp, dfaclsz);
+
+ return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner,
+ gid_t group)
+{
+ int aclcnt;
+ void *acldata;
+ int error;
+
+ /*
+ * See if we need to translate
+ */
+ if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+ (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACLENT_T))
+ return (0);
+
+ if (target_flavor == -1) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (target_flavor == _ACL_ACE_ENABLED &&
+ aclp->acl_type == ACLENT_T) {
+ error = convert_aent_to_ace(aclp->acl_aclp,
+ aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+
+ } else if (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACE_T) {
+ error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+ isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+ } else {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * replace old acl with newly translated acl
+ */
+ cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+ aclp->acl_aclp = acldata;
+ aclp->acl_cnt = aclcnt;
+ if (target_flavor == _ACL_ACE_ENABLED) {
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ } else {
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ }
+ return (0);
+
+out:
+
+#if !defined(_KERNEL)
+ errno = error;
+ return (-1);
+#else
+ return (error);
+#endif
+}
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
index 2227ad77ea93..84bd04f52fd6 100644
--- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#ifndef _ACL_ACL_UTILS_H
-#define _ACL_ACL_UTILS_H
+#ifndef _ACL_COMMON_H
+#define _ACL_COMMON_H
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -34,7 +33,7 @@
#include <sys/acl.h>
#include <sys/stat.h>
-#ifdef __cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
@@ -42,15 +41,21 @@ extern ace_t trivial_acl[6];
extern int acltrivial(const char *);
extern void adjust_ace_pair(ace_t *pair, mode_t mode);
+extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t);
extern int ace_trivial(ace_t *acep, int aclcnt);
+extern int ace_trivial_common(void *, int,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
+ uint32_t *mask));
+extern acl_t *acl_alloc(acl_type_t);
+extern void acl_free(acl_t *aclp);
+extern int acl_translate(acl_t *aclp, int target_flavor,
+ int isdir, uid_t owner, gid_t group);
void ksort(caddr_t v, int n, int s, int (*f)());
int cmp2acls(void *a, void *b);
-
-
-#ifdef __cplusplus
+#ifdef __cplusplus
}
#endif
-#endif /* _ACL_ACL_UTILS_H */
+#endif /* _ACL_COMMON_H */
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
index 2e62aa420d91..6851086c1f96 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,14 +18,13 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
- .ident "%Z%%M% %I% %E% SMI"
-
- .file "%M%"
+ .file "atomic.s"
#define _ASM
#include <sys/asm_linkage.h>
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
index bc7f22acc8d4..57f7d0a47b53 100644
--- a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
+++ b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,18 +18,54 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
- .ident "%Z%%M% %I% %E% SMI"
-
- .file "%M%"
+ .file "atomic.s"
#define _ASM
#include <sys/asm_linkage.h>
+ /*
+ * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_dec_64_nv.
+ */
+ ENTRY(atomic_dec_64)
+ ALTENTRY(atomic_dec_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi // %edi = target address
+ movl (%edi), %eax
+ movl 4(%edi), %edx // %edx:%eax = old value
+1:
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ not %ecx
+ not %ebx // %ecx:%ebx = -1
+ addl %eax, %ebx
+ adcl %edx, %ecx // add in the carry from inc
+ lock
+ cmpxchg8b (%edi) // try to stick it in
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx // return new value
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_dec_64_nv)
+ SET_SIZE(atomic_dec_64)
+
+ /*
+ * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_add_64_nv.
+ */
ENTRY(atomic_add_64)
ALTENTRY(atomic_add_64_nv)
pushl %edi
diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c
index 1fa2236607bf..01aa3cb2fa9d 100644
--- a/sys/cddl/contrib/opensolaris/common/avl/avl.c
+++ b/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -808,6 +808,64 @@ avl_remove(avl_tree_t *tree, void *data)
} while (parent != NULL);
}
+#define AVL_REINSERT(tree, obj) \
+ avl_remove((tree), (obj)); \
+ avl_add((tree), (obj))
+
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) <= 0));
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) >= 0));
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
/*
* initialize a new AVL tree
*/
@@ -853,6 +911,12 @@ avl_numnodes(avl_tree_t *tree)
return (tree->avl_numnodes);
}
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes == 0);
+}
#define CHILDBIT (1L)
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
index d3d5bed525ab..9cdc534ffc1b 100644
--- a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,15 +34,17 @@
#if defined(_KERNEL) && !defined(_BOOT)
#include <sys/varargs.h>
+#include <sys/sunddi.h>
#else
#include <stdarg.h>
-#include <strings.h>
+#include <stdlib.h>
+#include <string.h>
#endif
#ifndef offsetof
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
#endif
-
+#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++
/*
* nvpair.c - Provides kernel & userland interfaces for manipulating
@@ -201,7 +203,7 @@ nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
static void
nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
{
- bzero(priv, sizeof (priv));
+ bzero(priv, sizeof (nvpriv_t));
priv->nvp_nva = nva;
priv->nvp_stat = stat;
@@ -395,6 +397,9 @@ i_validate_type_nelem(data_type_t type, uint_t nelem)
case DATA_TYPE_STRING:
case DATA_TYPE_HRTIME:
case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
if (nelem != 1)
return (EINVAL);
break;
@@ -733,6 +738,11 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
case DATA_TYPE_UINT64:
value_sz = sizeof (uint64_t);
break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ value_sz = sizeof (double);
+ break;
+#endif
case DATA_TYPE_STRING:
if (data == NULL)
value_sz = 0;
@@ -1017,6 +1027,14 @@ nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
}
+#if !defined(_KERNEL)
+int
+nvlist_add_double(nvlist_t *nvl, const char *name, double val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
+}
+#endif
+
int
nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
{
@@ -1123,13 +1141,15 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
curr = NVPAIR2I_NVP(nvp);
/*
- * Ensure that nvp is an valid pointer.
+ * Ensure that nvp is a valid nvpair on this nvlist.
+ * NB: nvp_curr is used only as a hint so that we don't always
+ * have to walk the list to determine if nvp is still on the list.
*/
if (nvp == NULL)
curr = priv->nvp_list;
- else if (priv->nvp_curr == curr)
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
curr = curr->nvi_next;
- else if (nvlist_contains_nvp(nvl, nvp) == 0)
+ else
curr = NULL;
priv->nvp_curr = curr;
@@ -1149,6 +1169,27 @@ nvpair_type(nvpair_t *nvp)
return (NVP_TYPE(nvp));
}
+int
+nvpair_type_is_array(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+
+ if ((type == DATA_TYPE_BYTE_ARRAY) ||
+ (type == DATA_TYPE_UINT8_ARRAY) ||
+ (type == DATA_TYPE_INT16_ARRAY) ||
+ (type == DATA_TYPE_UINT16_ARRAY) ||
+ (type == DATA_TYPE_INT32_ARRAY) ||
+ (type == DATA_TYPE_UINT32_ARRAY) ||
+ (type == DATA_TYPE_INT64_ARRAY) ||
+ (type == DATA_TYPE_UINT64_ARRAY) ||
+ (type == DATA_TYPE_BOOLEAN_ARRAY) ||
+ (type == DATA_TYPE_STRING_ARRAY) ||
+ (type == DATA_TYPE_NVLIST_ARRAY))
+ return (1);
+ return (0);
+
+}
+
static int
nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
{
@@ -1176,6 +1217,9 @@ nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
case DATA_TYPE_INT64:
case DATA_TYPE_UINT64:
case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
if (data == NULL)
return (EINVAL);
bcopy(NVP_VALUE(nvp), data,
@@ -1312,6 +1356,14 @@ nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
}
+#if !defined(_KERNEL)
+int
+nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
int
nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
{
@@ -1446,6 +1498,9 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
case DATA_TYPE_HRTIME:
case DATA_TYPE_STRING:
case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
val = va_arg(ap, void *);
ret = nvlist_lookup_common(nvl, name, type, NULL, val);
break;
@@ -1479,6 +1534,224 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
return (ret);
}
+/*
+ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
+ * returns zero and a pointer to the matching nvpair is returned in '*ret'
+ * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate
+ * multiple levels of embedded nvlists, with 'sep' as the separator. As an
+ * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
+ * "a.d[3].e[1]". This matches the C syntax for array embed (for convience,
+ * code also supports "a.d[3]e[1]" syntax).
+ *
+ * If 'ip' is non-NULL and the last name component is an array, return the
+ * value of the "...[index]" array index in *ip. For an array reference that
+ * is not indexed, *ip will be returned as -1. If there is a syntax error in
+ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
+ * inside the 'name' string where the syntax error was detected.
+ */
+static int
+nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
+ nvpair_t **ret, int *ip, char **ep)
+{
+ nvpair_t *nvp;
+ const char *np;
+ char *sepp;
+ char *idxp, *idxep;
+ nvlist_t **nva;
+ long idx;
+ int n;
+
+ if (ip)
+ *ip = -1; /* not indexed */
+ if (ep)
+ *ep = NULL;
+
+ if ((nvl == NULL) || (name == NULL))
+ return (EINVAL);
+
+ /* step through components of name */
+ for (np = name; np && *np; np = sepp) {
+ /* ensure unique names */
+ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
+ return (ENOTSUP);
+
+ /* skip white space */
+ skip_whitespace(np);
+ if (*np == 0)
+ break;
+
+ /* set 'sepp' to end of current component 'np' */
+ if (sep)
+ sepp = strchr(np, sep);
+ else
+ sepp = NULL;
+
+ /* find start of next "[ index ]..." */
+ idxp = strchr(np, '[');
+
+ /* if sepp comes first, set idxp to NULL */
+ if (sepp && idxp && (sepp < idxp))
+ idxp = NULL;
+
+ /*
+ * At this point 'idxp' is set if there is an index
+ * expected for the current component.
+ */
+ if (idxp) {
+ /* set 'n' to length of current 'np' name component */
+ n = idxp++ - np;
+
+ /* keep sepp up to date for *ep use as we advance */
+ skip_whitespace(idxp);
+ sepp = idxp;
+
+ /* determine the index value */
+#if defined(_KERNEL) && !defined(_BOOT)
+ if (ddi_strtol(idxp, &idxep, 0, &idx))
+ goto fail;
+#else
+ idx = strtol(idxp, &idxep, 0);
+#endif
+ if (idxep == idxp)
+ goto fail;
+
+ /* keep sepp up to date for *ep use as we advance */
+ sepp = idxep;
+
+ /* skip white space index value and check for ']' */
+ skip_whitespace(sepp);
+ if (*sepp++ != ']')
+ goto fail;
+
+ /* for embedded arrays, support C syntax: "a[1].b" */
+ skip_whitespace(sepp);
+ if (sep && (*sepp == sep))
+ sepp++;
+ } else if (sepp) {
+ n = sepp++ - np;
+ } else {
+ n = strlen(np);
+ }
+
+ /* trim trailing whitespace by reducing length of 'np' */
+ if (n == 0)
+ goto fail;
+ for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
+ ;
+ n++;
+
+ /* skip whitespace, and set sepp to NULL if complete */
+ if (sepp) {
+ skip_whitespace(sepp);
+ if (*sepp == 0)
+ sepp = NULL;
+ }
+
+ /*
+ * At this point:
+ * o 'n' is the length of current 'np' component.
+ * o 'idxp' is set if there was an index, and value 'idx'.
+ * o 'sepp' is set to the beginning of the next component,
+ * and set to NULL if we have no more components.
+ *
+ * Search for nvpair with matching component name.
+ */
+ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ /* continue if no match on name */
+ if (strncmp(np, nvpair_name(nvp), n) ||
+ (strlen(nvpair_name(nvp)) != n))
+ continue;
+
+ /* if indexed, verify type is array oriented */
+ if (idxp && !nvpair_type_is_array(nvp))
+ goto fail;
+
+ /*
+ * Full match found, return nvp and idx if this
+ * was the last component.
+ */
+ if (sepp == NULL) {
+ if (ret)
+ *ret = nvp;
+ if (ip && idxp)
+ *ip = (int)idx; /* return index */
+ return (0); /* found */
+ }
+
+ /*
+ * More components: current match must be
+ * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
+ * to support going deeper.
+ */
+ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
+ nvl = EMBEDDED_NVL(nvp);
+ break;
+ } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
+ (void) nvpair_value_nvlist_array(nvp,
+ &nva, (uint_t *)&n);
+ if ((n < 0) || (idx >= n))
+ goto fail;
+ nvl = nva[idx];
+ break;
+ }
+
+ /* type does not support more levels */
+ goto fail;
+ }
+ if (nvp == NULL)
+ goto fail; /* 'name' not found */
+
+ /* search for match of next component in embedded 'nvl' list */
+ }
+
+fail: if (ep && sepp)
+ *ep = sepp;
+ return (EINVAL);
+}
+
+/*
+ * Return pointer to nvpair with specified 'name'.
+ */
+int
+nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
+}
+
+/*
+ * Determine if named nvpair exists in nvlist (use embedded separator of '.'
+ * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed
+ * description.
+ */
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
+ const char *name, nvpair_t **ret, int *ip, char **ep)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
+}
+
+boolean_t
+nvlist_exists(nvlist_t *nvl, const char *name)
+{
+ nvpriv_t *priv;
+ nvpair_t *nvp;
+ i_nvp_t *curr;
+
+ if (name == NULL || nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_FALSE);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
int
nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
{
@@ -1539,6 +1812,14 @@ nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
}
+#if !defined(_KERNEL)
+int
+nvpair_value_double(nvpair_t *nvp, double *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
int
nvpair_value_string(nvpair_t *nvp, char **val)
{
@@ -2676,7 +2957,11 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
*/
ret = xdr_longlong_t(xdr, (void *)buf);
break;
-
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
case DATA_TYPE_STRING:
ret = xdr_string(xdr, &buf, buflen - 1);
break;
@@ -2782,6 +3067,9 @@ nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
case DATA_TYPE_INT64:
case DATA_TYPE_UINT64:
case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
nvp_sz += 8;
break;
diff --git a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
new file mode 100644
index 000000000000..73cf74a4d159
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
@@ -0,0 +1,2130 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#else
+#include <strings.h>
+#endif /* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep.h>
+#include <sys/u8_textprep_data.h>
+
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upperbound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Some Hangul related macros at below.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not
+ * the actual U+11A8. This is due to that the trailing consonant is optional
+ * and thus we are doing a pre-calculation of subtracting one.
+ *
+ * Each of 19 modern leading consonants has total 588 possible syllables since
+ * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
+ * no trailing consonant case, i.e., 21 x 28 = 588.
+ *
+ * We also have bunch of Hangul related macros at below. Please bear in mind
+ * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is
+ * a Hangul Jamo or not but the value does not guarantee that it is a Hangul
+ * Jamo; it just guarantee that it will be most likely.
+ */
+#define U8_HANGUL_SYL_FIRST (0xAC00U)
+#define U8_HANGUL_SYL_LAST (0xD7A3U)
+
+#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
+#define U8_HANGUL_JAMO_L_LAST (0x1112U)
+#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
+#define U8_HANGUL_JAMO_V_LAST (0x1175U)
+#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
+#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
+
+#define U8_HANGUL_V_COUNT (21)
+#define U8_HANGUL_VT_COUNT (588)
+#define U8_HANGUL_T_COUNT (28)
+
+#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
+
+#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define U8_HANGUL_JAMO_L(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define U8_HANGUL_JAMO_V(u) \
+ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define U8_HANGUL_JAMO_T(u) \
+ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_JAMO(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_SYLLABLE(u) \
+ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
+ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
+
+/* The types of decomposition mappings. */
+#define U8_DECOMP_BOTH (0xF5U)
+#define U8_DECOMP_CANONICAL (0xF6U)
+
+/* The indicator for 16-bit table. */
+#define U8_16BIT_TABLE_INDICATOR (0x8000U)
+
+/* The following are some convenience macros. */
+#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+ (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
+ (uint32_t)(b3) & 0x3F;
+
+#define U8_SIMPLE_SWAP(a, b, t) \
+ (t) = (a); \
+ (a) = (b); \
+ (b) = (t);
+
+#define U8_ASCII_TOUPPER(c) \
+ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define U8_ASCII_TOLOWER(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If the assumptions are not met, then, the macro will fail.
+ */
+#define U8_SWAP_COMB_MARKS(a, b) \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8t[k] = u8s[start[(a)] + k]; \
+ for (k = 0; k < disp[(b)]; k++) \
+ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+ start[(b)] = start[(a)] + disp[(b)]; \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8s[start[(b)] + k] = u8t[k]; \
+ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/* The possible states during normalization. */
+typedef enum {
+ U8_STATE_START = 0,
+ U8_STATE_HANGUL_L = 1,
+ U8_STATE_HANGUL_LV = 2,
+ U8_STATE_HANGUL_LVT = 3,
+ U8_STATE_HANGUL_V = 4,
+ U8_STATE_HANGUL_T = 5,
+ U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors at below are used to check bytes of a given UTF-8
+ * character are valid and not containing any malformed byte values.
+ *
+ * We used to have a quite relaxed UTF-8 binary representation but then there
+ * was some security related issues and so the Unicode Consortium defined
+ * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
+ * one more time at the Unicode 3.2. The following three tables are based on
+ * that.
+ */
+
+#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
+
+#define I_ U8_ILLEGAL_CHAR
+#define O_ U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef I_
+#undef O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+
+/*
+ * The u8_validate() validates on the given UTF-8 character string and
+ * calculate the byte length. It is quite similar to mblen(3C) except that
+ * this will validate against the list of characters if required and
+ * specific to UTF-8 and Unicode.
+ */
+int
+u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t **p;
+ uchar_t *s1;
+ uchar_t *s2;
+ uchar_t f;
+ int sz;
+ size_t i;
+ int ret_val;
+ boolean_t second;
+ boolean_t no_need_to_validate_entire;
+ boolean_t check_additional;
+ boolean_t validate_ucs2_range_only;
+
+ if (! u8str)
+ return (0);
+
+ ib = (uchar_t *)u8str;
+ ibtail = ib + n;
+
+ ret_val = 0;
+
+ no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+ check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
+ validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+ while (ib < ibtail) {
+ /*
+ * The first byte of a UTF-8 character tells how many
+ * bytes will follow for the character. If the first byte
+ * is an illegal byte value or out of range value, we just
+ * return -1 with an appropriate error number.
+ */
+ sz = u8_number_of_bytes[*ib];
+ if (sz == U8_ILLEGAL_CHAR) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+
+ if (sz == U8_OUT_OF_RANGE_CHAR ||
+ (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+ *errnum = ERANGE;
+ return (-1);
+ }
+
+ /*
+ * If we don't have enough bytes to check on, that's also
+ * an error. As you can see, we give illegal byte sequence
+ * checking higher priority then EINVAL cases.
+ */
+ if ((ibtail - ib) < sz) {
+ *errnum = EINVAL;
+ return (-1);
+ }
+
+ if (sz == 1) {
+ ib++;
+ ret_val++;
+ } else {
+ /*
+ * Check on the multi-byte UTF-8 character. For more
+ * details on this, see comment added for the used
+ * data structures at the beginning of the file.
+ */
+ f = *ib++;
+ ret_val++;
+ second = B_TRUE;
+ for (i = 1; i < sz; i++) {
+ if (second) {
+ if (*ib < u8_valid_min_2nd_byte[f] ||
+ *ib > u8_valid_max_2nd_byte[f]) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ second = B_FALSE;
+ } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ ib++;
+ ret_val++;
+ }
+ }
+
+ if (check_additional) {
+ for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+ s1 = ib - sz;
+ s2 = p[i];
+ while (s1 < ib) {
+ if (*s1 != *s2 || *s2 == '\0')
+ break;
+ s1++;
+ s2++;
+ }
+
+ if (s1 >= ib && *s2 == '\0') {
+ *errnum = EBADF;
+ return (-1);
+ }
+ }
+ }
+
+ if (no_need_to_validate_entire)
+ break;
+ }
+
+ return (ret_val);
+}
+
+/*
+ * The do_case_conv() looks at the mapping tables and returns found
+ * bytes if any. If not found, the input bytes are returned. The function
+ * always terminate the return bytes with a null character assuming that
+ * there are plenty of room to do so.
+ *
+ * The case conversions are simple case conversions mapping a character to
+ * another character as specified in the Unicode data. The byte size of
+ * the mapped character could be different from that of the input character.
+ *
+ * The return value is the byte length of the returned character excluding
+ * the terminating null byte.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+ size_t i;
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ /*
+ * At this point, the only possible values for sz are 2, 3, and 4.
+ * The u8s should point to a vector that is well beyond the size of
+ * 5 bytes.
+ */
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ } else if (sz == 3) {
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ } else {
+ /* This is not possible but just in case as a fallback. */
+ if (is_it_toupper)
+ *u8s = U8_ASCII_TOUPPER(*s);
+ else
+ *u8s = U8_ASCII_TOLOWER(*s);
+ u8s[1] = '\0';
+
+ return (1);
+ }
+ u8s[sz] = '\0';
+
+ /*
+ * Let's find out if we have a corresponding character.
+ */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_case_common_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ if (is_it_toupper) {
+ b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ /* Either there is no match or an error at the table. */
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
+ } else {
+ b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
+ }
+
+ /*
+ * If i is still zero, that means there is no corresponding character.
+ */
+ if (i == 0)
+ return ((size_t)sz);
+
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The do_case_compare() function compares the two input strings, s1 and s2,
+ * one character at a time doing case conversions if applicable and return
+ * the comparison result as like strcmp().
+ *
+ * Since, in empirical sense, most of text data are 7-bit ASCII characters,
+ * we treat the 7-bit ASCII characters as a special case trying to yield
+ * faster processing time.
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+ size_t n2, boolean_t is_it_toupper, int *errnum)
+{
+ int f;
+ int sz1;
+ int sz2;
+ size_t j;
+ size_t i1;
+ size_t i2;
+ uchar_t u8s1[U8_MB_CUR_MAX + 1];
+ uchar_t u8s2[U8_MB_CUR_MAX + 1];
+
+ i1 = i2 = 0;
+ while (i1 < n1 && i2 < n2) {
+ /*
+ * Find out what would be the byte length for this UTF-8
+ * character at string s1 and also find out if this is
+ * an illegal start byte or not and if so, issue a proper
+ * error number and yet treat this byte as a character.
+ */
+ sz1 = u8_number_of_bytes[*s1];
+ if (sz1 < 0) {
+ *errnum = EILSEQ;
+ sz1 = 1;
+ }
+
+ /*
+ * For 7-bit ASCII characters mainly, we do a quick case
+ * conversion right at here.
+ *
+ * If we don't have enough bytes for this character, issue
+ * an EINVAL error and use what are available.
+ *
+ * If we have enough bytes, find out if there is
+ * a corresponding uppercase character and if so, copy over
+ * the bytes for a comparison later. If there is no
+ * corresponding uppercase character, then, use what we have
+ * for the comparison.
+ */
+ if (sz1 == 1) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ s1++;
+ u8s1[1] = '\0';
+ } else if ((i1 + sz1) > n1) {
+ *errnum = EINVAL;
+ for (j = 0; (i1 + j) < n1; )
+ u8s1[j++] = *s1++;
+ u8s1[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
+ s1 += sz1;
+ }
+
+ /* Do the same for the string s2. */
+ sz2 = u8_number_of_bytes[*s2];
+ if (sz2 < 0) {
+ *errnum = EILSEQ;
+ sz2 = 1;
+ }
+
+ if (sz2 == 1) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ s2++;
+ u8s2[1] = '\0';
+ } else if ((i2 + sz2) > n2) {
+ *errnum = EINVAL;
+ for (j = 0; (i2 + j) < n2; )
+ u8s2[j++] = *s2++;
+ u8s2[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
+ s2 += sz2;
+ }
+
+ /* Now compare the two characters. */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ f = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (f != 0)
+ return (f);
+ }
+
+ /*
+ * They were the same. Let's move on to the next
+ * characters then.
+ */
+ i1 += sz1;
+ i2 += sz2;
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+ * If we reached to or went over the ends for the both, that means
+ * they are the same.
+ *
+ * If we reached only one of the two ends, that means the other string
+ * has something which then the fact can be used to determine
+ * the return value.
+ */
+ if (i1 >= n1) {
+ if (i2 >= n2)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The combining_class() function checks on the given bytes and find out
+ * the corresponding Unicode combining class value. The return value 0 means
+ * it is a Starter. Any illegal UTF-8 character will also be treated as
+ * a Starter.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b4 = 0;
+
+ if (sz == 1 || sz > 4)
+ return (0);
+
+ if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ }
+
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b2 = u8_combining_class_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b3 = u8_combining_class_b3_tbl[uv][b2][b3];
+ if (b3 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ return (u8_combining_class_b4_tbl[uv][b3][b4]);
+}
+
+/*
+ * The do_decomp() function finds out a matching decomposition if any
+ * and return. If there is no match, the input bytes are copied and returned.
+ * The function also checks if there is a Hangul, decomposes it if necessary
+ * and returns.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * The function returns the number of bytes returned sans always terminating
+ * the null byte. It will also return a state that will tell if there was
+ * a Hangul character decomposed which then will be used by the caller.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+ boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+ size_t i;
+ uint32_t u1;
+
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ u8s[2] = '\0';
+ } else if (sz == 3) {
+ /* Convert it to a Unicode scalar value. */
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+ /*
+ * If this is a Hangul syllable, we decompose it into
+ * a leading consonant, a vowel, and an optional trailing
+ * consonant and then return.
+ */
+ if (U8_HANGUL_SYLLABLE(u1)) {
+ u1 -= U8_HANGUL_SYL_FIRST;
+
+ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+ / U8_HANGUL_T_COUNT;
+ b3 = u1 % U8_HANGUL_T_COUNT;
+
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+ if (b3) {
+ b3 += U8_HANGUL_JAMO_T_FIRST;
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+ u8s[9] = '\0';
+ *state = U8_STATE_HANGUL_LVT;
+ return (9);
+ }
+
+ u8s[6] = '\0';
+ *state = U8_STATE_HANGUL_LV;
+ return (6);
+ }
+
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ u8s[3] = '\0';
+
+ /*
+ * If this is a Hangul Jamo, we know there is nothing
+ * further that we can decompose.
+ */
+ if (U8_HANGUL_JAMO_L(u1)) {
+ *state = U8_STATE_HANGUL_L;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_V(u1)) {
+ if (*state == U8_STATE_HANGUL_L)
+ *state = U8_STATE_HANGUL_LV;
+ else
+ *state = U8_STATE_HANGUL_V;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_T(u1)) {
+ if (*state == U8_STATE_HANGUL_LV)
+ *state = U8_STATE_HANGUL_LVT;
+ else
+ *state = U8_STATE_HANGUL_T;
+ return (3);
+ }
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ u8s[4] = '\0';
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ u8s[0] = s[0];
+ u8s[1] = '\0';
+ *state = U8_STATE_START;
+ return (1);
+ }
+
+ /*
+ * At this point, this rountine does not know what it would get.
+ * The caller should sort it out if the state isn't a Hangul one.
+ */
+ *state = U8_STATE_START;
+
+ /* Try to find matching decomposition mapping byte sequence. */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_decomp_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ /*
+ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+ * which is 0x8000, this means we couldn't fit the mappings into
+ * the cardinality of a unsigned byte.
+ */
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ /* This also means there wasn't any matching decomposition. */
+ if (start_id >= end_id)
+ return ((size_t)sz);
+
+ /*
+ * The final table for decomposition mappings has three types of
+ * byte sequences depending on whether a mapping is for compatibility
+ * decomposition, canonical decomposition, or both like the following:
+ *
+ * (1) Compatibility decomposition mappings:
+ *
+ * +---+---+-...-+---+
+ * | B0| B1| ... | Bm|
+ * +---+---+-...-+---+
+ *
+ * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
+ *
+ * (2) Canonical decomposition mappings:
+ *
+ * +---+---+---+-...-+---+
+ * | T | b0| b1| ... | bn|
+ * +---+---+---+-...-+---+
+ *
+ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+ *
+ * (3) Both mappings:
+ *
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ *
+ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+ * compatibility mapping bytes.
+ *
+ * Note that compatibility decomposition means doing recursive
+ * decompositions using both compatibility decomposition mappings and
+ * canonical decomposition mappings. On the other hand, canonical
+ * decomposition means doing recursive decompositions using only
+ * canonical decomposition mappings. Since the table we have has gone
+ * through the recursions already, we do not need to do so during
+ * runtime, i.e., the table has been completely flattened out
+ * already.
+ */
+
+ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+ /* Get the type, T, of the byte sequence. */
+ b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+ /*
+ * If necessary, adjust start_id, end_id, or both. Note that if
+ * this is compatibility decomposition mapping, there is no
+ * adjustment.
+ */
+ if (canonical_decomposition) {
+ /* Is the mapping only for compatibility decomposition? */
+ if (b1 < U8_DECOMP_BOTH)
+ return ((size_t)sz);
+
+ start_id++;
+
+ if (b1 == U8_DECOMP_BOTH) {
+ end_id = start_id +
+ u8_decomp_final_tbl[uv][b3_base + start_id];
+ start_id++;
+ }
+ } else {
+ /*
+ * Unless this is a compatibility decomposition mapping,
+ * we adjust the start_id.
+ */
+ if (b1 == U8_DECOMP_BOTH) {
+ start_id++;
+ start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+ } else if (b1 == U8_DECOMP_CANONICAL) {
+ start_id++;
+ }
+ }
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The find_composition_start() function uses the character bytes given and
+ * find out the matching composition mappings if any and return the address
+ * to the composition mappings as explained in the do_composition().
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ if (sz == 1) {
+ b4 = s[0];
+ } else if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ return (NULL);
+ }
+
+ b1 = u8_composition_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b2 = u8_composition_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ if (start_id >= end_id)
+ return (NULL);
+
+ b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
+
+ return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
+}
+
+/*
+ * The blocked() function checks on the combining class values of previous
+ * characters in this sequence and return whether it is blocked or not.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+ uchar_t my_comb_class;
+ size_t i;
+
+ my_comb_class = comb_class[last];
+ for (i = 1; i < last; i++)
+ if (comb_class[i] >= my_comb_class ||
+ comb_class[i] == U8_COMBINING_CLASS_STARTER)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * The do_composition() reads the character string pointed by 's' and
+ * do necessary canonical composition and then copy over the result back to
+ * the 's'.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc[U8_MB_CUR_MAX];
+ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+ size_t saved_marks_count;
+ uchar_t *p;
+ uchar_t *saved_p;
+ uchar_t *q;
+ size_t i;
+ size_t saved_i;
+ size_t j;
+ size_t k;
+ size_t l;
+ size_t C;
+ size_t saved_l;
+ size_t size;
+ uint32_t u1;
+ uint32_t u2;
+ boolean_t match_not_found = B_TRUE;
+
+ /*
+ * This should never happen unless the callers are doing some strange
+ * and unexpected things.
+ *
+ * The "last" is the index pointing to the last character not last + 1.
+ */
+ if (last >= U8_MAX_CHARS_A_SEQ)
+ last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+ for (i = l = 0; i <= last; i++) {
+ /*
+ * The last or any non-Starters at the beginning, we don't
+ * have any chance to do composition and so we just copy them
+ * to the temporary buffer.
+ */
+ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+ p = s + start[i];
+ size = disp[i];
+ for (k = 0; k < size; k++)
+ t[l++] = *p++;
+ continue;
+ }
+
+ /*
+ * If this could be a start of Hangul Jamos, then, we try to
+ * conjoin them.
+ */
+ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+ s[start[i] + 1], s[start[i] + 2]);
+ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+ s[start[i] + 4], s[start[i] + 5]);
+
+ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+ u1 -= U8_HANGUL_JAMO_L_FIRST;
+ u2 -= U8_HANGUL_JAMO_V_FIRST;
+ u1 = U8_HANGUL_SYL_FIRST +
+ (u1 * U8_HANGUL_V_COUNT + u2) *
+ U8_HANGUL_T_COUNT;
+
+ i += 2;
+ if (i <= last) {
+ U8_PUT_3BYTES_INTO_UTF32(u2,
+ s[start[i]], s[start[i] + 1],
+ s[start[i] + 2]);
+
+ if (U8_HANGUL_JAMO_T(u2)) {
+ u1 += u2 -
+ U8_HANGUL_JAMO_T_FIRST;
+ i++;
+ }
+ }
+
+ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+ i--;
+ l += 3;
+ continue;
+ }
+ }
+
+ /*
+ * Let's then find out if this Starter has composition
+ * mapping.
+ */
+ p = find_composition_start(uv, s + start[i], disp[i]);
+ if (p == NULL)
+ goto SAVE_THE_CHAR;
+
+ /*
+ * We have a Starter with composition mapping and the next
+ * character is a non-Starter. Let's try to find out if
+ * we can do composition.
+ */
+
+ saved_p = p;
+ saved_i = i;
+ saved_l = l;
+ saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+ q = s + start[++i];
+ size = disp[i];
+
+ /*
+ * The next for() loop compares the non-Starter pointed by
+ * 'q' with the possible (joinable) characters pointed by 'p'.
+ *
+ * The composition final table entry pointed by the 'p'
+ * looks like the following:
+ *
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ *
+ * where C is the count byte indicating the number of
+ * mapping pairs where each pair would be look like
+ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+ * character of a canonical decomposition and the B0-Bm are
+ * the bytes of a matching composite character. The F is
+ * a filler byte after each character as the separator.
+ */
+
+ match_not_found = B_TRUE;
+
+ for (C = *p++; C > 0; C--) {
+ for (k = 0; k < size; p++, k++)
+ if (*p != q[k])
+ break;
+
+ /* Have we found it? */
+ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ t[l++] = *p;
+
+ break;
+ }
+
+ /* We didn't find; skip to the next pair. */
+ if (*p != U8_TBL_ELEMENT_FILLER)
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ p++;
+ }
+
+ /*
+ * If there was no match, we will need to save the combining
+ * mark for later appending. After that, if the next one
+ * is a non-Starter and not blocked, then, we try once
+ * again to do composition with the next non-Starter.
+ *
+ * If there was no match and this was a Starter, then,
+ * this is a new start.
+ *
+ * If there was a match and a composition done and we have
+ * more to check on, then, we retrieve a new composition final
+ * table entry for the composite and then try to do the
+ * composition again.
+ */
+
+ if (match_not_found) {
+ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+ i--;
+ goto SAVE_THE_CHAR;
+ }
+
+ saved_marks[saved_marks_count++] = i;
+ }
+
+ if (saved_l == l) {
+ while (i < last) {
+ if (blocked(comb_class, i + 1))
+ saved_marks[saved_marks_count++] = ++i;
+ else
+ break;
+ }
+ if (i < last) {
+ p = saved_p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ } else if (i < last) {
+ p = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (p != NULL) {
+ saved_p = p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ }
+
+ /*
+ * There is no more composition possible.
+ *
+ * If there was no composition what so ever then we copy
+ * over the original Starter and then append any non-Starters
+ * remaining at the target string sequentially after that.
+ */
+
+ if (saved_l == l) {
+ p = s + start[saved_i];
+ size = disp[saved_i];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+
+ for (k = 0; k < saved_marks_count; k++) {
+ p = s + start[saved_marks[k]];
+ size = disp[saved_marks[k]];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+ }
+
+ /*
+ * If the last character is a Starter and if we have a character
+ * (possibly another Starter) that can be turned into a composite,
+ * we do so and we do so until there is no more of composition
+ * possible.
+ */
+ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+ p = *os;
+ saved_l = l - disp[last];
+
+ while (p < oslast) {
+ size = u8_number_of_bytes[*p];
+ if (size <= 1 || (p + size) > oslast)
+ break;
+
+ saved_p = p;
+
+ for (i = 0; i < size; i++)
+ tc[i] = *p++;
+
+ q = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (q == NULL) {
+ p = saved_p;
+ break;
+ }
+
+ match_not_found = B_TRUE;
+
+ for (C = *q++; C > 0; C--) {
+ for (k = 0; k < size; q++, k++)
+ if (*q != tc[k])
+ break;
+
+ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++q != U8_TBL_ELEMENT_FILLER) {
+ /*
+ * This is practically
+ * impossible but we don't
+ * want to take any chances.
+ */
+ if (l >=
+ U8_STREAM_SAFE_TEXT_MAX) {
+ p = saved_p;
+ goto SAFE_RETURN;
+ }
+ t[l++] = *q;
+ }
+
+ break;
+ }
+
+ if (*q != U8_TBL_ELEMENT_FILLER)
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ q++;
+ }
+
+ if (match_not_found) {
+ p = saved_p;
+ break;
+ }
+ }
+SAFE_RETURN:
+ *os = p;
+ }
+
+ /*
+ * Now we copy over the temporary string to the target string.
+ * Since composition always reduces the number of characters or
+ * the number of characters stay, we don't need to worry about
+ * the buffer overflow here.
+ */
+ for (i = 0; i < l; i++)
+ s[i] = t[i];
+ s[l] = '\0';
+
+ return (l);
+}
+
+/*
+ * The collect_a_seq() function checks on the given string s, collect
+ * a sequence of characters at u8s, and return the sequence. While it collects
+ * a sequence, it also applies case conversion, canonical or compatibility
+ * decomposition, canonical decomposition, or some or all of them and
+ * in that order.
+ *
+ * The collected sequence cannot be bigger than 32 characters since if
+ * it is having more than 31 characters, the sequence will be terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * a Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte and the return value is the byte length of the sequence
+ * including 0. The return value does not include the terminating
+ * null byte.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+ boolean_t is_it_toupper,
+ boolean_t is_it_tolower,
+ boolean_t canonical_decomposition,
+ boolean_t compatibility_decomposition,
+ boolean_t canonical_composition,
+ int *errnum, u8_normalization_states_t *state)
+{
+ uchar_t *s;
+ int sz;
+ int saved_sz;
+ size_t i;
+ size_t j;
+ size_t k;
+ size_t l;
+ uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+ uchar_t disp[U8_MAX_CHARS_A_SEQ];
+ uchar_t start[U8_MAX_CHARS_A_SEQ];
+ uchar_t u8t[U8_MB_CUR_MAX];
+ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc;
+ size_t last;
+ size_t saved_last;
+ uint32_t u1;
+
+ /*
+ * Save the source string pointer which we will return a changed
+ * pointer if we do processing.
+ */
+ s = *source;
+
+ /*
+ * The following is a fallback for just in case callers are not
+ * checking the string boundaries before the calling.
+ */
+ if (s >= slast) {
+ u8s[0] = '\0';
+
+ return (0);
+ }
+
+ /*
+ * As the first thing, let's collect a character and do case
+ * conversion if necessary.
+ */
+
+ sz = u8_number_of_bytes[*s];
+
+ if (sz < 0) {
+ *errnum = EILSEQ;
+
+ u8s[0] = *s++;
+ u8s[1] = '\0';
+
+ *source = s;
+
+ return (1);
+ }
+
+ if (sz == 1) {
+ if (is_it_toupper)
+ u8s[0] = U8_ASCII_TOUPPER(*s);
+ else if (is_it_tolower)
+ u8s[0] = U8_ASCII_TOLOWER(*s);
+ else
+ u8s[0] = *s;
+ s++;
+ u8s[1] = '\0';
+ } else if ((s + sz) > slast) {
+ *errnum = EINVAL;
+
+ for (i = 0; s < slast; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+
+ *source = s;
+
+ return (i);
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+ s += sz;
+ sz = i;
+ } else {
+ for (i = 0; i < sz; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+ }
+ }
+
+ /*
+ * And then canonical/compatibility decomposition followed by
+ * an optional canonical composition. Please be noted that
+ * canonical composition is done only when a decomposition is
+ * done.
+ */
+ if (canonical_decomposition || compatibility_decomposition) {
+ if (sz == 1) {
+ *state = U8_STATE_START;
+
+ saved_sz = 1;
+
+ comb_class[0] = 0;
+ start[0] = 0;
+ disp[0] = 1;
+
+ last = 1;
+ } else {
+ saved_sz = do_decomp(uv, u8s, u8s, sz,
+ canonical_decomposition, state);
+
+ last = 0;
+
+ for (i = 0; i < saved_sz; ) {
+ sz = u8_number_of_bytes[u8s[i]];
+
+ comb_class[last] = combining_class(uv,
+ u8s + i, sz);
+ start[last] = i;
+ disp[last] = sz;
+
+ last++;
+ i += sz;
+ }
+
+ /*
+ * Decomposition yields various Hangul related
+ * states but not on combining marks. We need to
+ * find out at here by checking on the last
+ * character.
+ */
+ if (*state == U8_STATE_START) {
+ if (comb_class[last - 1])
+ *state = U8_STATE_COMBINING_MARK;
+ }
+ }
+
+ saved_last = last;
+
+ while (s < slast) {
+ sz = u8_number_of_bytes[*s];
+
+ /*
+ * If this is an illegal character, an incomplete
+ * character, or an 7-bit ASCII Starter character,
+ * then we have collected a sequence; break and let
+ * the next call deal with the two cases.
+ *
+ * Note that this is okay only if you are using this
+ * function with a fixed length string, not on
+ * a buffer with multiple calls of one chunk at a time.
+ */
+ if (sz <= 1) {
+ break;
+ } else if ((s + sz) > slast) {
+ break;
+ } else {
+ /*
+ * If the previous character was a Hangul Jamo
+ * and this character is a Hangul Jamo that
+ * can be conjoined, we collect the Jamo.
+ */
+ if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1,
+ *s, *(s + 1), *(s + 2));
+
+ if (U8_HANGUL_COMPOSABLE_L_V(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LV;
+ goto COLLECT_A_HANGUL;
+ }
+
+ if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LVT;
+ goto COLLECT_A_HANGUL;
+ }
+ }
+
+ /*
+ * Regardless of whatever it was, if this is
+ * a Starter, we don't collect the character
+ * since that's a new start and we will deal
+ * with it at the next time.
+ */
+ i = combining_class(uv, s, sz);
+ if (i == U8_COMBINING_CLASS_STARTER)
+ break;
+
+ /*
+ * We know the current character is a combining
+ * mark. If the previous character wasn't
+ * a Starter (not Hangul) or a combining mark,
+ * then, we don't collect this combining mark.
+ */
+ if (*state != U8_STATE_START &&
+ *state != U8_STATE_COMBINING_MARK)
+ break;
+
+ *state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+ /*
+ * If we collected a Starter and combining
+ * marks up to 30, i.e., total 31 characters,
+ * then, we terminate this degenerately long
+ * combining sequence with a U+034F COMBINING
+ * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
+ * UTF-8 and turn this into a Stream-Safe
+ * Text. This will be extremely rare but
+ * possible.
+ *
+ * The following will also guarantee that
+ * we are not writing more than 32 characters
+ * plus a NULL at u8s[].
+ */
+ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+ *state = U8_STATE_START;
+ comb_class[last] = 0;
+ start[last] = saved_sz;
+ disp[last] = 2;
+ last++;
+
+ u8s[saved_sz++] = 0xCD;
+ u8s[saved_sz++] = 0x8F;
+
+ break;
+ }
+
+ /*
+ * Some combining marks also do decompose into
+ * another combining mark or marks.
+ */
+ if (*state == U8_STATE_COMBINING_MARK) {
+ k = last;
+ l = sz;
+ i = do_decomp(uv, uts, s, sz,
+ canonical_decomposition, state);
+ for (j = 0; j < i; ) {
+ sz = u8_number_of_bytes[uts[j]];
+
+ comb_class[last] =
+ combining_class(uv,
+ uts + j, sz);
+ start[last] = saved_sz + j;
+ disp[last] = sz;
+
+ last++;
+ if (last >=
+ U8_UPPER_LIMIT_IN_A_SEQ) {
+ last = k;
+ goto TURN_STREAM_SAFE;
+ }
+ j += sz;
+ }
+
+ *state = U8_STATE_COMBINING_MARK;
+ sz = i;
+ s += l;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = uts[i];
+ } else {
+ comb_class[last] = i;
+ start[last] = saved_sz;
+ disp[last] = sz;
+ last++;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = *s++;
+ }
+
+ /*
+ * If this is U+0345 COMBINING GREEK
+ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
+ * iota subscript, and need to be converted to
+ * uppercase letter, convert it to U+0399 GREEK
+ * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
+ * i.e., convert to capital adscript form as
+ * specified in the Unicode standard.
+ *
+ * This is the only special case of (ambiguous)
+ * case conversion at combining marks and
+ * probably the standard will never have
+ * anything similar like this in future.
+ */
+ if (is_it_toupper && sz >= 2 &&
+ u8s[saved_sz - 2] == 0xCD &&
+ u8s[saved_sz - 1] == 0x85) {
+ u8s[saved_sz - 2] = 0xCE;
+ u8s[saved_sz - 1] = 0x99;
+ }
+ }
+ }
+
+ /*
+ * Let's try to ensure a canonical ordering for the collected
+ * combining marks. We do this only if we have collected
+ * at least one more non-Starter. (The decomposition mapping
+ * data tables have fully (and recursively) expanded and
+ * canonically ordered decompositions.)
+ *
+ * The U8_SWAP_COMB_MARKS() convenience macro has some
+ * assumptions and we are meeting the assumptions.
+ */
+ last--;
+ if (last >= saved_last) {
+ for (i = 0; i < last; i++)
+ for (j = last; j > i; j--)
+ if (comb_class[j] &&
+ comb_class[j - 1] > comb_class[j]) {
+ U8_SWAP_COMB_MARKS(j - 1, j);
+ }
+ }
+
+ *source = s;
+
+ if (! canonical_composition) {
+ u8s[saved_sz] = '\0';
+ return (saved_sz);
+ }
+
+ /*
+ * Now do the canonical composition. Note that we do this
+ * only after a canonical or compatibility decomposition to
+ * finish up NFC or NFKC.
+ */
+ sz = do_composition(uv, u8s, comb_class, start, disp, last,
+ &s, slast);
+ }
+
+ *source = s;
+
+ return ((size_t)sz);
+}
+
+/*
+ * The do_norm_compare() function does string comparion based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It does so by collecting a sequence of character at a time and comparing
+ * the collected sequences from the strings.
+ *
+ * The meanings on the return values are the same as the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+ int flag, int *errnum)
+{
+ int result;
+ size_t sz1;
+ size_t sz2;
+ uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t *s1last;
+ uchar_t *s2last;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ u8_normalization_states_t state;
+
+ s1last = s1 + n1;
+ s2last = s2 + n2;
+
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (s1 < s1last && s2 < s2last) {
+ /*
+ * If the current character is a 7-bit ASCII and the last
+ * character, or, if the current character and the next
+ * character are both some 7-bit ASCII characters then
+ * we treat the current character as a sequence.
+ *
+ * In any other cases, we need to call collect_a_seq().
+ */
+
+ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
+ ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else if (is_it_tolower)
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ else
+ u8s1[0] = *s1;
+ u8s1[1] = '\0';
+ sz1 = 1;
+ s1++;
+ } else {
+ state = U8_STATE_START;
+ sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
+ ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else if (is_it_tolower)
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ else
+ u8s2[0] = *s2;
+ u8s2[1] = '\0';
+ sz2 = 1;
+ s2++;
+ } else {
+ state = U8_STATE_START;
+ sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ /*
+ * Now compare the two characters. If they are the same,
+ * we move on to the next character sequences.
+ */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ result = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (result != 0)
+ return (result);
+ }
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+ * If we reached to or went over the ends for the both, that means
+ * they are the same.
+ *
+ * If we reached only one end, that means the other string has
+ * something which then can be used to determine the return value.
+ */
+ if (s1 >= s1last) {
+ if (s2 >= s2last)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings quite similar to
+ * the strcmp(). For the comparison, however, Unicode Normalization specific
+ * equivalency and Unicode simple case conversion mappings based equivalency
+ * can be requested and checked against.
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+ int *errnum)
+{
+ int f;
+ size_t n1;
+ size_t n2;
+
+ *errnum = 0;
+
+ /*
+ * Check on the requested Unicode version, case conversion, and
+ * normalization flag values.
+ */
+
+ if (uv > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ uv = U8_UNICODE_LATEST;
+ }
+
+ if (flag == 0) {
+ flag = U8_STRCMP_CS;
+ } else {
+ f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+ U8_STRCMP_CI_LOWER);
+ if (f == 0) {
+ flag |= U8_STRCMP_CS;
+ } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
+ f != U8_STRCMP_CI_LOWER) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
+ f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+ }
+
+ if (flag == U8_STRCMP_CS) {
+ return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
+ }
+
+ n1 = strlen(s1);
+ n2 = strlen(s2);
+ if (n != 0) {
+ if (n < n1)
+ n1 = n;
+ if (n < n2)
+ n2 = n;
+ }
+
+ /*
+ * Simple case conversion can be done much faster and so we do
+ * them separately here.
+ */
+ if (flag == U8_STRCMP_CI_UPPER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_TRUE, errnum));
+ } else if (flag == U8_STRCMP_CI_LOWER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_FALSE, errnum));
+ }
+
+ return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
+ flag, errnum));
+}
+
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+ int flag, size_t unicode_version, int *errnum)
+{
+ int f;
+ int sz;
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t *ob;
+ uchar_t *obtail;
+ boolean_t do_not_ignore_null;
+ boolean_t do_not_ignore_invalid;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ size_t ret_val;
+ size_t i;
+ size_t j;
+ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+ u8_normalization_states_t state;
+
+ if (unicode_version > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ if (inarray == NULL || *inlen == 0)
+ return (0);
+
+ if (outarray == NULL) {
+ *errnum = E2BIG;
+ return ((size_t)-1);
+ }
+
+ ib = (uchar_t *)inarray;
+ ob = (uchar_t *)outarray;
+ ibtail = ib + *inlen;
+ obtail = ob + *outlen;
+
+ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+ ret_val = 0;
+
+ /*
+ * If we don't have a normalization flag set, we do the simple case
+ * conversion based text preparation separately below. Text
+ * preparation involving Normalization will be done in the false task
+ * block, again, separately since it will take much more time and
+ * resource than doing simple case conversions.
+ */
+ if (f == 0) {
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ sz = u8_number_of_bytes[*ib];
+
+ if (sz < 0) {
+ if (do_not_ignore_invalid) {
+ *errnum = EILSEQ;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ sz = 1;
+ ret_val++;
+ }
+
+ if (sz == 1) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else if ((ib + sz) > ibtail) {
+ if (do_not_ignore_invalid) {
+ *errnum = EINVAL;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < (ibtail - ib)) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /*
+ * We treat the remaining incomplete character
+ * bytes as a character.
+ */
+ ret_val++;
+
+ while (ib < ibtail)
+ *ob++ = *ib++;
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(unicode_version, u8s,
+ ib, sz, is_it_toupper);
+
+ if ((obtail - ob) < i) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ ib += sz;
+
+ for (sz = 0; sz < i; sz++)
+ *ob++ = u8s[sz];
+ } else {
+ if ((obtail - ob) < sz) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < sz; i++)
+ *ob++ = *ib++;
+ }
+ }
+ }
+ } else {
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ /*
+ * If the current character is a 7-bit ASCII
+ * character and it is the last character, or,
+ * if the current character is a 7-bit ASCII
+ * character and the next character is also a 7-bit
+ * ASCII character, then, we copy over this
+ * character without going through collect_a_seq().
+ *
+ * In any other cases, we need to look further with
+ * the collect_a_seq() function.
+ */
+ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else {
+ *errnum = 0;
+ state = U8_STATE_START;
+
+ j = collect_a_seq(unicode_version, u8s,
+ &ib, ibtail,
+ is_it_toupper,
+ is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition,
+ errnum, &state);
+
+ if (*errnum && do_not_ignore_invalid) {
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < j) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < j; i++)
+ *ob++ = u8s[i];
+ }
+ }
+ }
+
+ *inlen = ibtail - ib;
+ *outlen = obtail - ob;
+
+ return (ret_val);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
new file mode 100644
index 000000000000..74517a3f6920
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file is intended for functions that ought to be common between user
+ * land (libzfs) and the kernel. When many common routines need to be shared
+ * then a separate file should to be created.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/nvpair.h>
+
+/*
+ * Are there allocatable vdevs?
+ */
+boolean_t
+zfs_allocatable_devs(nvlist_t *nv)
+{
+ uint64_t is_log;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ is_log = 0;
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (!is_log)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
new file mode 100644
index 000000000000..f517044a80a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_COMUTIL_H
+#define _ZFS_COMUTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_COMUTIL_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
new file mode 100644
index 000000000000..0fd5800a84dc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
@@ -0,0 +1,234 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#include <strings.h>
+#include <libnvpair.h>
+#include <ctype.h>
+#endif
+/* XXX includes zfs_context.h, so why bother with the above? */
+#include <sys/dsl_deleg.h>
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+
+/*
+ * permission table
+ *
+ * Keep this table in sorted order
+ *
+ * This table is used for displaying all permissions for
+ * zfs allow
+ */
+
+zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
+ {ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW},
+ {ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+ {ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+ {ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+ {ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+ {ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+ {ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+ {ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+ {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+ {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+ {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+ {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
+ {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+ {NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+static int
+zfs_valid_permission_name(const char *perm)
+{
+ if (zfs_deleg_canonicalize_perm(perm))
+ return (0);
+
+ return (permset_namecheck(perm, NULL, NULL));
+}
+
+const char *
+zfs_deleg_canonicalize_perm(const char *perm)
+{
+ int i;
+ zfs_prop_t prop;
+
+ for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
+ if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
+ return (perm);
+ }
+
+ prop = zfs_name_to_prop(perm);
+ if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
+ return (zfs_prop_to_name(prop));
+ return (NULL);
+
+}
+
+static int
+zfs_validate_who(char *who)
+{
+ char *p;
+
+ if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
+ return (-1);
+
+ switch (who[0]) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ for (p = &who[3]; *p; p++)
+ if (!isdigit(*p))
+ return (-1);
+ break;
+
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ return (permset_namecheck(&who[3], NULL, NULL));
+
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+zfs_deleg_verify_nvlist(nvlist_t *nvp)
+{
+ nvpair_t *who, *perm_name;
+ nvlist_t *perms;
+ int error;
+
+ if (nvp == NULL)
+ return (-1);
+
+ who = nvlist_next_nvpair(nvp, NULL);
+ if (who == NULL)
+ return (-1);
+
+ do {
+ if (zfs_validate_who(nvpair_name(who)))
+ return (-1);
+
+ error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
+
+ if (error && error != ENOENT)
+ return (-1);
+ if (error == ENOENT)
+ continue;
+
+ perm_name = nvlist_next_nvpair(perms, NULL);
+ if (perm_name == NULL) {
+ return (-1);
+ }
+ do {
+ error = zfs_valid_permission_name(
+ nvpair_name(perm_name));
+ if (error)
+ return (-1);
+ } while (perm_name = nvlist_next_nvpair(perms, perm_name));
+ } while (who = nvlist_next_nvpair(nvp, who));
+ return (0);
+}
+
+/*
+ * Construct the base attribute name. The base attribute names
+ * are the "key" to locate the jump objects which contain the actual
+ * permissions. The base attribute names are encoded based on
+ * type of entry and whether it is a local or descendent permission.
+ *
+ * Arguments:
+ * attr - attribute name return string, attribute is assumed to be
+ * ZFS_MAX_DELEG_NAME long.
+ * type - type of entry to construct
+ * inheritchr - inheritance type (local,descendent, or NA for create and
+ * permission set definitions
+ * data - is either a permission set name or a 64 bit uid/gid.
+ */
+void
+zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char inheritchr, void *data)
+{
+ int len = ZFS_MAX_DELEG_NAME;
+ uint64_t *id = data;
+
+ switch (type) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
+ break;
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ (void) snprintf(attr, len, "%c-%c%s", type,
+ ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
+ break;
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ (void) snprintf(attr, len, "%c-%c", type,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ (void) snprintf(attr, len, "%c%c%c", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ default:
+ ASSERT(!"bad zfs_deleg_who_type_t");
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
new file mode 100644
index 000000000000..561b73e63df4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_DELEG_H
+#define _ZFS_DELEG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */
+#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */
+
+/*
+ * Max name length for a delegation attribute
+ */
+#define ZFS_MAX_DELEG_NAME 128
+
+#define ZFS_DELEG_LOCAL 'l'
+#define ZFS_DELEG_DESCENDENT 'd'
+#define ZFS_DELEG_NA '-'
+
+typedef enum {
+ ZFS_DELEG_NOTE_CREATE,
+ ZFS_DELEG_NOTE_DESTROY,
+ ZFS_DELEG_NOTE_SNAPSHOT,
+ ZFS_DELEG_NOTE_ROLLBACK,
+ ZFS_DELEG_NOTE_CLONE,
+ ZFS_DELEG_NOTE_PROMOTE,
+ ZFS_DELEG_NOTE_RENAME,
+ ZFS_DELEG_NOTE_RECEIVE,
+ ZFS_DELEG_NOTE_ALLOW,
+ ZFS_DELEG_NOTE_USERPROP,
+ ZFS_DELEG_NOTE_MOUNT,
+ ZFS_DELEG_NOTE_SHARE,
+ ZFS_DELEG_NOTE_NONE
+} zfs_deleg_note_t;
+
+typedef struct zfs_deleg_perm_tab {
+ char *z_perm;
+ zfs_deleg_note_t z_note;
+} zfs_deleg_perm_tab_t;
+
+extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];
+
+int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
+void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char checkflag, void *data);
+const char *zfs_deleg_canonicalize_perm(const char *perm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_DELEG_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
index 2004d860d329..a9d109be20ab 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,7 +44,9 @@
#endif
#include <sys/param.h>
+#include <sys/nvpair.h>
#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
static int
valid_char(char c)
@@ -52,7 +54,7 @@ valid_char(char c)
return ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
+ c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
}
/*
@@ -90,6 +92,32 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
return (0);
}
+
+/*
+ * Permissions set name must start with the letter '@' followed by the
+ * same character restrictions as snapshot names, except that the name
+ * cannot exceed 64 characters.
+ */
+int
+permset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] != '@') {
+ if (why) {
+ *why = NAME_ERR_NO_AT;
+ *what = path[0];
+ }
+ return (-1);
+ }
+
+ return (snapshot_namecheck(&path[1], why, what));
+}
+
/*
* Dataset names must be of the following form:
*
@@ -98,7 +126,10 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
* Where each component is made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.:%]
+ *
+ * We allow '%' here as we use that character internally to create unique
+ * names for temporary clones (for online recv).
*/
int
dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -114,6 +145,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
* If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
* places using MAXNAMELEN.
*/
+
if (strlen(path) >= MAXNAMELEN) {
if (why)
*why = NAME_ERR_TOOLONG;
@@ -167,7 +199,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
/* Validate the contents of this component */
while (loc != end) {
- if (!valid_char(*loc)) {
+ if (!valid_char(*loc) && *loc != '%') {
if (why) {
*why = NAME_ERR_INVALCHAR;
*what = *loc;
@@ -211,6 +243,50 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
}
}
+
+/*
+ * mountpoint names must be of the following form:
+ *
+ * /[component][/]*[component][/]
+ */
+int
+mountpoint_namecheck(const char *path, namecheck_err_t *why)
+{
+ const char *start, *end;
+
+ /*
+ * Make sure none of the mountpoint component names are too long.
+ * If a component name is too long then the mkdir of the mountpoint
+ * will fail but then the mountpoint property will be set to a value
+ * that can never be mounted. Better to fail before setting the prop.
+ * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
+ */
+
+ if (path == NULL || *path != '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ /* Skip leading slash */
+ start = &path[1];
+ do {
+ end = start;
+ while (*end != '/' && *end != '\0')
+ end++;
+
+ if (end - start >= MAXNAMELEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+ start = end + 1;
+
+ } while (*end != '\0');
+
+ return (0);
+}
+
/*
* For pool names, we have the same set of valid characters as described in
* dataset names, with the additional restriction that the pool name must begin
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
index 7e0cda974cc6..ec85e62f72e8 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,12 +42,17 @@ typedef enum {
NAME_ERR_RESERVED, /* entire name is reserved */
NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
NAME_ERR_TOOLONG, /* name is too long */
+ NAME_ERR_NO_AT, /* permission set is missing '@' */
} namecheck_err_t;
+#define ZFS_PERMSET_MAXLEN 64
+
int pool_namecheck(const char *, namecheck_err_t *, char *);
int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int mountpoint_namecheck(const char *, namecheck_err_t *);
int dataset_name_hidden(const char *);
int snapshot_namecheck(const char *, namecheck_err_t *, char *);
+int permset_namecheck(const char *, namecheck_err_t *, char *);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index 71256192c092..27a6f2e3dd00 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -19,40 +19,19 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Master property table.
- *
- * This table keeps track of all the properties supported by ZFS, and their
- * various attributes. Not all of these are needed by the kernel, and several
- * are only used by a single libzfs client. But having them here centralizes
- * all property information in one location.
- *
- * name The human-readable string representing this property
- * proptype Basic type (string, boolean, number)
- * default Default value for the property. Sadly, C only allows
- * you to initialize the first member of a union, so we
- * have two default members for each property.
- * attr Attributes (readonly, inheritable) for the property
- * types Valid dataset types to which this applies
- * values String describing acceptable values for the property
- * colname The column header for 'zfs list'
- * colfmt The column formatting for 'zfs list'
- *
- * This table must match the order of property types in libzfs.h.
- */
-
#include <sys/zio.h>
#include <sys/spa.h>
+#include <sys/u8_textprep.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
#include "zfs_prop.h"
+#include "zfs_deleg.h"
#if defined(_KERNEL)
#include <sys/systm.h>
@@ -62,244 +41,283 @@
#include <ctype.h>
#endif
-typedef enum {
- prop_default,
- prop_readonly,
- prop_inherit
-} prop_attr_t;
-
-typedef struct {
- const char *pd_name;
- zfs_proptype_t pd_proptype;
- uint64_t pd_numdefault;
- const char *pd_strdefault;
- prop_attr_t pd_attr;
- int pd_types;
- const char *pd_values;
- const char *pd_colname;
- boolean_t pd_rightalign;
- boolean_t pd_visible;
-} prop_desc_t;
-
-static prop_desc_t zfs_prop_table[] = {
- { "type", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE,
- B_TRUE },
- { "creation", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE },
- { "used", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE },
- { "available", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE,
- B_TRUE },
- { "referenced", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY,
- "<size>", "REFER", B_TRUE, B_TRUE },
- { "compressratio", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE,
- B_TRUE },
- { "mounted", prop_type_boolean, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE },
- { "origin", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN",
- B_FALSE, B_TRUE },
- { "quota", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE },
- { "reservation", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<size> | none", "RESERV", B_TRUE, B_TRUE },
- { "volsize", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE },
- { "volblocksize", prop_type_number, 8192, NULL, prop_readonly,
- ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE,
- B_TRUE },
- { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL,
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE },
- { "mountpoint", prop_type_string, 0, "/", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE },
- { "sharenfs", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE },
- { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE,
- B_TRUE },
- { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE },
- { "atime", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "ATIME", B_TRUE, B_TRUE },
- { "devices", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "DEVICES", B_TRUE, B_TRUE },
- { "exec", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "EXEC", B_TRUE, B_TRUE },
- { "setuid", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
- B_TRUE, B_TRUE },
- { "readonly", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off", "RDONLY", B_TRUE, B_TRUE },
- { "jailed", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "JAILED", B_TRUE, B_TRUE },
- { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden",
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE },
- { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE },
- { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE,
- B_TRUE },
- { "createtxg", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE },
- { "name", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE },
- { "canmount", prop_type_boolean, 1, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "CANMOUNT", B_TRUE, B_TRUE },
- { "shareiscsi", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_ANY,
- "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE },
- { "iscsioptions", prop_type_string, 0, NULL, prop_inherit,
- ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE },
- { "xattr", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "XATTR", B_TRUE, B_TRUE },
- { "numclones", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE },
- { "copies", prop_type_index, 1, "1", prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE },
- { "bootfs", prop_type_string, 0, NULL, prop_default,
- ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE },
-};
-
-#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t)))
-
-/*
- * Returns TRUE if the property applies to the given dataset types.
- */
-int
-zfs_prop_valid_for_type(zfs_prop_t prop, int types)
-{
- return ((zfs_prop_table[prop].pd_types & types) != 0);
-}
-
-/*
- * Determine if the specified property is visible or not.
- */
-boolean_t
-zfs_prop_is_visible(zfs_prop_t prop)
-{
- if (prop < 0)
- return (B_FALSE);
-
- return (zfs_prop_table[prop].pd_visible);
-}
-
-/*
- * Iterate over all properties, calling back into the specified function
- * for each property. We will continue to iterate until we either
- * reach the end or the callback function something other than
- * ZFS_PROP_CONT.
- */
-zfs_prop_t
-zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type,
- boolean_t show_all)
-{
- int i;
-
- for (i = 0; i < ZFS_PROP_COUNT; i++) {
- if (zfs_prop_valid_for_type(i, type) &&
- (zfs_prop_is_visible(i) || show_all)) {
- if (func(i, cb) != ZFS_PROP_CONT)
- return (i);
- }
- }
- return (ZFS_PROP_CONT);
-}
+static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
-zfs_prop_t
-zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all)
+zprop_desc_t *
+zfs_prop_get_table(void)
{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all));
+ return (zfs_prop_table);
}
-zpool_prop_t
-zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all)
+void
+zfs_prop_init(void)
{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all));
-}
+ static zprop_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { NULL }
+ };
+
+ static zprop_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { NULL }
+ };
+
+ static zprop_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { NULL }
+ };
+
+ static zprop_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */
+ { NULL }
+ };
+
+ static zprop_index_t case_table[] = {
+ { "sensitive", ZFS_CASE_SENSITIVE },
+ { "insensitive", ZFS_CASE_INSENSITIVE },
+ { "mixed", ZFS_CASE_MIXED },
+ { NULL }
+ };
+
+ static zprop_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+ };
-zfs_proptype_t
-zfs_prop_get_type(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype);
-}
-
-static boolean_t
-propname_match(const char *p, zfs_prop_t prop, size_t len)
-{
- const char *propname = zfs_prop_table[prop].pd_name;
-#ifndef _KERNEL
- const char *colname = zfs_prop_table[prop].pd_colname;
- int c;
-#endif
-
-#ifndef _KERNEL
- if (colname == NULL)
- return (B_FALSE);
-#endif
-
- if (len == strlen(propname) &&
- strncmp(p, propname, len) == 0)
- return (B_TRUE);
-
-#ifndef _KERNEL
- if (len != strlen(colname))
- return (B_FALSE);
-
- for (c = 0; c < len; c++)
- if (p[c] != tolower(colname[c]))
- break;
-
- return (colname[c] == '\0');
-#else
- return (B_FALSE);
-#endif
-}
-
-zfs_prop_t
-zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data)
-{
- const char *propname = cb_data;
-
- if (propname_match(propname, prop, strlen(propname)))
- return (prop);
-
- return (ZFS_PROP_CONT);
+ /*
+ * Use the unique flags we have to send to u8_strcmp() and/or
+ * u8_textprep() to represent the various normalization property
+ * values.
+ */
+ static zprop_index_t normalize_table[] = {
+ { "none", 0 },
+ { "formD", U8_TEXTPREP_NFD },
+ { "formKC", U8_TEXTPREP_NFKC },
+ { "formC", U8_TEXTPREP_NFC },
+ { "formKD", U8_TEXTPREP_NFKD },
+ { NULL }
+ };
+
+ static zprop_index_t version_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { "current", ZPL_VERSION },
+ { NULL }
+ };
+
+ static zprop_index_t boolean_table[] = {
+ { "off", 0 },
+ { "on", 1 },
+ { NULL }
+ };
+
+ static zprop_index_t canmount_table[] = {
+ { "off", ZFS_CANMOUNT_OFF },
+ { "on", ZFS_CANMOUNT_ON },
+ { "noauto", ZFS_CANMOUNT_NOAUTO },
+ { NULL }
+ };
+
+ static zprop_index_t cache_table[] = {
+ { "none", ZFS_CACHE_NONE },
+ { "metadata", ZFS_CACHE_METADATA },
+ { "all", ZFS_CACHE_ALL },
+ { NULL }
+ };
+
+ /* inherit index properties */
+ register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
+ checksum_table);
+ register_index(ZFS_PROP_COMPRESSION, "compression",
+ ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
+ register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", snapdir_table);
+ register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
+ register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | restricted | passthrough",
+ "ACLINHERIT", acl_inherit_table);
+ register_index(ZFS_PROP_COPIES, "copies", 1,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", copies_table);
+ register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "PRIMARYCACHE", cache_table);
+ register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "SECONDARYCACHE", cache_table);
+
+ /* inherit index (boolean) properties */
+ register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
+ register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
+ boolean_table);
+ register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
+ boolean_table);
+ register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ boolean_table);
+ register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
+ boolean_table);
+ register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+ register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
+ boolean_table);
+ register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
+ boolean_table);
+ register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
+ boolean_table);
+
+ /* default index properties */
+ register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "1 | 2 | 3 | current", "VERSION", version_table);
+ register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
+ "CANMOUNT", canmount_table);
+
+ /* readonly index (boolean) properties */
+ register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+
+ /* set once index properties */
+ register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "none | formC | formD | formKC | formKD", "NORMALIZATION",
+ normalize_table);
+ register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "sensitive | insensitive | mixed", "CASE", case_table);
+
+ /* set once index (boolean) properties */
+ register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "UTF8ONLY", boolean_table);
+
+ /* string properties */
+ register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
+ register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
+ register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
+ register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
+ ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
+ register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
+ register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
+
+ /* readonly number properties */
+ register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "USED");
+ register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
+ register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "REFER");
+ register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "RATIO");
+ register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192,
+ PROP_ONETIME,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
+ register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
+ register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
+ register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
+ register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+
+ /* default number properties */
+ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
+ register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
+ register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+ register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+ register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "REFRESERV");
+
+ /* inherit number properties */
+ register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
+ PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+
+ /* hidden properties */
+ register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
+ register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
+ register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
+ PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
+ ZFS_TYPE_DATASET, "GUID");
+
+ /* oddball properties */
+ register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
}
-/*
- * Given a property name and its type, returns the corresponding property ID.
- */
-zfs_prop_t
-zfs_name_to_prop_common(const char *propname, zfs_type_t type)
+boolean_t
+zfs_prop_delegatable(zfs_prop_t prop)
{
- zfs_prop_t prop;
-
- prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname,
- type, B_TRUE);
- return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop);
+ zprop_desc_t *pd = &zfs_prop_table[prop];
+ return (pd->pd_attr != PROP_READONLY);
}
/*
@@ -308,17 +326,9 @@ zfs_name_to_prop_common(const char *propname, zfs_type_t type)
zfs_prop_t
zfs_name_to_prop(const char *propname)
{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY));
+ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
}
-/*
- * Given a pool property name, returns the corresponding property ID.
- */
-zpool_prop_t
-zpool_name_to_prop(const char *propname)
-{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL));
-}
/*
* For user property names, we allow all lowercase alphanumeric characters, plus
@@ -357,179 +367,85 @@ zfs_prop_user(const char *name)
}
/*
- * Return the default value for the given property.
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
*/
-const char *
-zfs_prop_default_string(zfs_prop_t prop)
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
{
- return (zfs_prop_table[prop].pd_strdefault);
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
}
-uint64_t
-zfs_prop_default_numeric(zfs_prop_t prop)
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
{
- return (zfs_prop_table[prop].pd_numdefault);
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
}
/*
- * Returns TRUE if the property is readonly.
+ * Returns TRUE if the property applies to any of the given dataset types.
*/
-int
-zfs_prop_readonly(zfs_prop_t prop)
+boolean_t
+zfs_prop_valid_for_type(int prop, zfs_type_t types)
{
- return (zfs_prop_table[prop].pd_attr == prop_readonly);
+ return (zprop_valid_for_type(prop, types));
}
-/*
- * Given a dataset property ID, returns the corresponding name.
- * Assuming the zfs dataset propety ID is valid.
- */
-const char *
-zfs_prop_to_name(zfs_prop_t prop)
+zprop_type_t
+zfs_prop_get_type(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_name);
+ return (zfs_prop_table[prop].pd_proptype);
}
/*
- * Given a pool property ID, returns the corresponding name.
- * Assuming the pool propety ID is valid.
+ * Returns TRUE if the property is readonly.
*/
-const char *
-zpool_prop_to_name(zpool_prop_t prop)
+boolean_t
+zfs_prop_readonly(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_name);
+ return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
/*
- * Returns TRUE if the property is inheritable.
+ * Returns TRUE if the property is only allowed to be set once.
*/
-int
-zfs_prop_inheritable(zfs_prop_t prop)
+boolean_t
+zfs_prop_setonce(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_attr == prop_inherit);
+ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
-typedef struct zfs_index {
- const char *name;
- uint64_t index;
-} zfs_index_t;
-
-static zfs_index_t checksum_table[] = {
- { "on", ZIO_CHECKSUM_ON },
- { "off", ZIO_CHECKSUM_OFF },
- { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
- { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
- { "sha256", ZIO_CHECKSUM_SHA256 },
- { NULL }
-};
-
-static zfs_index_t compress_table[] = {
- { "on", ZIO_COMPRESS_ON },
- { "off", ZIO_COMPRESS_OFF },
- { "lzjb", ZIO_COMPRESS_LZJB },
- { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */
- { "gzip-1", ZIO_COMPRESS_GZIP_1 },
- { "gzip-2", ZIO_COMPRESS_GZIP_2 },
- { "gzip-3", ZIO_COMPRESS_GZIP_3 },
- { "gzip-4", ZIO_COMPRESS_GZIP_4 },
- { "gzip-5", ZIO_COMPRESS_GZIP_5 },
- { "gzip-6", ZIO_COMPRESS_GZIP_6 },
- { "gzip-7", ZIO_COMPRESS_GZIP_7 },
- { "gzip-8", ZIO_COMPRESS_GZIP_8 },
- { "gzip-9", ZIO_COMPRESS_GZIP_9 },
- { NULL }
-};
-
-static zfs_index_t snapdir_table[] = {
- { "hidden", ZFS_SNAPDIR_HIDDEN },
- { "visible", ZFS_SNAPDIR_VISIBLE },
- { NULL }
-};
-
-static zfs_index_t acl_mode_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "groupmask", ZFS_ACL_GROUPMASK },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t acl_inherit_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "noallow", ZFS_ACL_NOALLOW },
- { "secure", ZFS_ACL_SECURE },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t copies_table[] = {
- { "1", 1 },
- { "2", 2 },
- { "3", 3 },
- { NULL }
-};
-
-static zfs_index_t *
-zfs_prop_index_table(zfs_prop_t prop)
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
{
- switch (prop) {
- case ZFS_PROP_CHECKSUM:
- return (checksum_table);
- case ZFS_PROP_COMPRESSION:
- return (compress_table);
- case ZFS_PROP_SNAPDIR:
- return (snapdir_table);
- case ZFS_PROP_ACLMODE:
- return (acl_mode_table);
- case ZFS_PROP_ACLINHERIT:
- return (acl_inherit_table);
- case ZFS_PROP_COPIES:
- return (copies_table);
- default:
- return (NULL);
- }
+ return (zfs_prop_table[prop].pd_strdefault);
}
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
/*
- * Tables of index types, plus functions to convert between the user view
- * (strings) and internal representation (uint64_t).
+ * Given a dataset property ID, returns the corresponding name.
+ * Assuming the zfs dataset property ID is valid.
*/
-int
-zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (strcmp(string, table[i].name) == 0) {
- *index = table[i].index;
- return (0);
- }
- }
-
- return (-1);
+ return (zfs_prop_table[prop].pd_name);
}
-int
-zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+boolean_t
+zfs_prop_inheritable(zfs_prop_t prop)
{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (table[i].index == index) {
- *string = table[i].name;
- return (0);
- }
- }
-
- return (-1);
+ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
#ifndef _KERNEL
@@ -541,22 +457,6 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
const char *
zfs_prop_values(zfs_prop_t prop)
{
- if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL)
- return (NULL);
-
- return (zfs_prop_table[prop].pd_values);
-}
-
-/*
- * Returns a string describing the set of acceptable values for the given
- * zpool property, or NULL if it cannot be set.
- */
-const char *
-zpool_prop_values(zfs_prop_t prop)
-{
- if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL)
- return (NULL);
-
return (zfs_prop_table[prop].pd_values);
}
@@ -568,8 +468,8 @@ zpool_prop_values(zfs_prop_t prop)
int
zfs_prop_is_string(zfs_prop_t prop)
{
- return (zfs_prop_table[prop].pd_proptype == prop_type_string ||
- zfs_prop_table[prop].pd_proptype == prop_type_index);
+ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
+ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
}
/*
@@ -592,66 +492,4 @@ zfs_prop_align_right(zfs_prop_t prop)
return (zfs_prop_table[prop].pd_rightalign);
}
-/*
- * Determines the minimum width for the column, and indicates whether it's fixed
- * or not. Only string columns are non-fixed.
- */
-size_t
-zfs_prop_width(zfs_prop_t prop, boolean_t *fixed)
-{
- prop_desc_t *pd = &zfs_prop_table[prop];
- zfs_index_t *idx;
- size_t ret;
- int i;
-
- *fixed = B_TRUE;
-
- /*
- * Start with the width of the column name.
- */
- ret = strlen(pd->pd_colname);
-
- /*
- * For fixed-width values, make sure the width is large enough to hold
- * any possible value.
- */
- switch (pd->pd_proptype) {
- case prop_type_number:
- /*
- * The maximum length of a human-readable number is 5 characters
- * ("20.4M", for example).
- */
- if (ret < 5)
- ret = 5;
- /*
- * 'creation' is handled specially because it's a number
- * internally, but displayed as a date string.
- */
- if (prop == ZFS_PROP_CREATION)
- *fixed = B_FALSE;
- break;
- case prop_type_boolean:
- /*
- * The maximum length of a boolean value is 3 characters, for
- * "off".
- */
- if (ret < 3)
- ret = 3;
- break;
- case prop_type_index:
- idx = zfs_prop_index_table(prop);
- for (i = 0; idx[i].name != NULL; i++) {
- if (strlen(idx[i].name) > ret)
- ret = strlen(idx[i].name);
- }
- break;
-
- case prop_type_string:
- *fixed = B_FALSE;
- break;
- }
-
- return (ret);
-}
-
#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
index 133e740ce6bc..da5ae43093e5 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,14 +40,87 @@ extern "C" {
* in the kernel, but the string value in userland.
*/
typedef enum {
- prop_type_number, /* numeric value */
- prop_type_string, /* string value */
- prop_type_boolean, /* boolean value */
- prop_type_index /* numeric value indexed by string */
-} zfs_proptype_t;
-
-zfs_proptype_t zfs_prop_get_type(zfs_prop_t);
-size_t zfs_prop_width(zfs_prop_t, boolean_t *);
+ PROP_TYPE_NUMBER, /* numeric value */
+ PROP_TYPE_STRING, /* string value */
+ PROP_TYPE_INDEX /* numeric value indexed by string */
+} zprop_type_t;
+
+typedef enum {
+ PROP_DEFAULT,
+ PROP_READONLY,
+ PROP_INHERIT,
+ /*
+ * ONETIME properties are a sort of conglomeration of READONLY
+ * and INHERIT. They can be set only during object creation,
+ * after that they are READONLY. If not explicitly set during
+ * creation, they can be inherited.
+ */
+ PROP_ONETIME
+} zprop_attr_t;
+
+typedef struct zfs_index {
+ const char *pi_name;
+ uint64_t pi_value;
+} zprop_index_t;
+
+typedef struct {
+ const char *pd_name; /* human-readable property name */
+ int pd_propnum; /* property number */
+ zprop_type_t pd_proptype; /* string, boolean, index, number */
+ const char *pd_strdefault; /* default for strings */
+ uint64_t pd_numdefault; /* for boolean / index / number */
+ zprop_attr_t pd_attr; /* default, readonly, inherit */
+ int pd_types; /* bitfield of valid dataset types */
+ /* fs | vol | snap; or pool */
+ const char *pd_values; /* string telling acceptable values */
+ const char *pd_colname; /* column header for "zfs list" */
+ boolean_t pd_rightalign; /* column alignment for "zfs list" */
+ boolean_t pd_visible; /* do we list this property with the */
+ /* "zfs get" help message */
+ const zprop_index_t *pd_table; /* for index properties, a table */
+ /* defining the possible values */
+} zprop_desc_t;
+
+/*
+ * zfs dataset property functions
+ */
+void zfs_prop_init(void);
+zprop_type_t zfs_prop_get_type(zfs_prop_t);
+boolean_t zfs_prop_delegatable(zfs_prop_t prop);
+zprop_desc_t *zfs_prop_get_table(void);
+
+/*
+ * zpool property functions
+ */
+void zpool_prop_init(void);
+zprop_type_t zpool_prop_get_type(zpool_prop_t);
+zprop_desc_t *zpool_prop_get_table(void);
+
+/*
+ * Common routines to initialize property tables
+ */
+void register_impl(int, const char *, zprop_type_t, uint64_t,
+ const char *, zprop_attr_t, int, const char *, const char *,
+ boolean_t, boolean_t, const zprop_index_t *);
+void register_string(int, const char *, const char *, zprop_attr_t attr,
+ int, const char *, const char *);
+void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *);
+void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *, const zprop_index_t *);
+void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+ int, const char *);
+
+/*
+ * Common routines for zfs and zpool property management
+ */
+int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
+int zprop_name_to_prop(const char *, zfs_type_t);
+int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
+int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+const char *zprop_values(int, zfs_type_t);
+size_t zprop_width(int, boolean_t *, zfs_type_t);
+boolean_t zprop_valid_for_type(int, zfs_type_t);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
new file mode 100644
index 000000000000..f5efe18d248b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -0,0 +1,186 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
+
+zprop_desc_t *
+zpool_prop_get_table(void)
+{
+ return (zpool_prop_table);
+}
+
+void
+zpool_prop_init(void)
+{
+ static zprop_index_t boolean_table[] = {
+ { "off", 0},
+ { "on", 1},
+ { NULL }
+ };
+
+ static zprop_index_t failuremode_table[] = {
+ { "wait", ZIO_FAILURE_MODE_WAIT },
+ { "continue", ZIO_FAILURE_MODE_CONTINUE },
+ { "panic", ZIO_FAILURE_MODE_PANIC },
+ { NULL }
+ };
+
+ /* string properties */
+ register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<path>", "ALTROOT");
+ register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
+ register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+
+ /* readonly number properties */
+ register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "SIZE");
+ register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "USED");
+ register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "AVAIL");
+ register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "CAP");
+ register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<guid>", "GUID");
+ register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<state>", "HEALTH");
+
+ /* default number properties */
+ register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+
+ /* default index (boolean) properties */
+ register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
+ register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
+
+ /* default index properties */
+ register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "wait | continue | panic", "FAILMODE", failuremode_table);
+
+ /* hidden properties */
+ register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+}
+
+/*
+ * Given a property name and its type, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * Assuming the pool propety ID is valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_name);
+}
+
+zprop_type_t
+zpool_prop_get_type(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_proptype);
+}
+
+boolean_t
+zpool_prop_readonly(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
+}
+
+const char *
+zpool_prop_default_string(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zpool_prop_default_numeric(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_numdefault);
+}
+
+int
+zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
+ uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
+}
+
+int
+zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
+ const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
+}
+
+#ifndef _KERNEL
+
+const char *
+zpool_prop_values(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_values);
+}
+
+const char *
+zpool_prop_column_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_colname);
+}
+
+boolean_t
+zpool_prop_align_right(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_rightalign);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
new file mode 100644
index 000000000000..87619e1cbf07
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Common routines used by zfs and zpool property management.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/libkern.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t *
+zprop_get_proptable(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (zpool_prop_get_table());
+ else
+ return (zfs_prop_get_table());
+}
+
+static int
+zprop_get_numprops(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (ZPOOL_NUM_PROPS);
+ else
+ return (ZFS_NUM_PROPS);
+}
+
+void
+register_impl(int prop, const char *name, zprop_type_t type,
+ uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
+{
+ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
+ zprop_desc_t *pd;
+
+ pd = &prop_tbl[prop];
+
+ ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+
+ pd->pd_name = name;
+ pd->pd_propnum = prop;
+ pd->pd_proptype = type;
+ pd->pd_numdefault = numdefault;
+ pd->pd_strdefault = strdefault;
+ pd->pd_attr = attr;
+ pd->pd_types = objset_types;
+ pd->pd_values = values;
+ pd->pd_colname = colname;
+ pd->pd_rightalign = rightalign;
+ pd->pd_visible = visible;
+ pd->pd_table = idx_tbl;
+}
+
+void
+register_string(int prop, const char *name, const char *def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, NULL);
+
+}
+
+void
+register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname)
+{
+ register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, NULL);
+}
+
+void
+register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ const zprop_index_t *idx_tbl)
+{
+ register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
+}
+
+void
+register_hidden(int prop, const char *name, zprop_type_t type,
+ zprop_attr_t attr, int objset_types, const char *colname)
+{
+ register_impl(prop, name, type, 0, NULL, attr,
+ objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
+}
+
+
+/*
+ * A comparison function we can use to order indexes into property tables.
+ */
+static int
+zprop_compare(const void *arg1, const void *arg2)
+{
+ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
+ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
+ boolean_t p1ro, p2ro;
+
+ p1ro = (p1->pd_attr == PROP_READONLY);
+ p2ro = (p2->pd_attr == PROP_READONLY);
+
+ if (p1ro == p2ro)
+ return (strcmp(p1->pd_name, p2->pd_name));
+
+ return (p1ro ? -1 : 1);
+}
+
+/*
+ * Iterate over all properties in the given property table, calling back
+ * into the specified function for each property. We will continue to
+ * iterate until we either reach the end or the callback function returns
+ * something other than ZPROP_CONT.
+ */
+int
+zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
+ boolean_t ordered, zfs_type_t type)
+{
+ int i, j, num_props, size, prop;
+ zprop_desc_t *prop_tbl;
+ zprop_desc_t **order;
+
+ prop_tbl = zprop_get_proptable(type);
+ num_props = zprop_get_numprops(type);
+ size = num_props * sizeof (zprop_desc_t *);
+
+#if defined(_KERNEL)
+ order = kmem_alloc(size, KM_SLEEP);
+#else
+ if ((order = malloc(size)) == NULL)
+ return (ZPROP_CONT);
+#endif
+
+ for (j = 0; j < num_props; j++)
+ order[j] = &prop_tbl[j];
+
+ if (ordered) {
+ qsort((void *)order, num_props, sizeof (zprop_desc_t *),
+ zprop_compare);
+ }
+
+ prop = ZPROP_CONT;
+ for (i = 0; i < num_props; i++) {
+ if ((order[i]->pd_visible || show_all) &&
+ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
+ prop = order[i]->pd_propnum;
+ break;
+ }
+ }
+
+#if defined(_KERNEL)
+ kmem_free(order, size);
+#else
+ free(order);
+#endif
+ return (prop);
+}
+
+static boolean_t
+propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
+{
+ const char *propname = prop_entry->pd_name;
+#ifndef _KERNEL
+ const char *colname = prop_entry->pd_colname;
+ int c;
+
+ if (colname == NULL)
+ return (B_FALSE);
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
+
+typedef struct name_to_prop_cb {
+ const char *propname;
+ zprop_desc_t *prop_tbl;
+} name_to_prop_cb_t;
+
+static int
+zprop_name_to_prop_cb(int prop, void *cb_data)
+{
+ name_to_prop_cb_t *data = cb_data;
+
+ if (propname_match(data->propname, strlen(data->propname),
+ &data->prop_tbl[prop]))
+ return (prop);
+
+ return (ZPROP_CONT);
+}
+
+int
+zprop_name_to_prop(const char *propname, zfs_type_t type)
+{
+ int prop;
+ name_to_prop_cb_t cb_data;
+
+ cb_data.propname = propname;
+ cb_data.prop_tbl = zprop_get_proptable(type);
+
+ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
+ B_TRUE, B_FALSE, type);
+
+ return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
+}
+
+int
+zprop_string_to_index(int prop, const char *string, uint64_t *index,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (strcmp(string, idx_tbl[i].pi_name) == 0) {
+ *index = idx_tbl[i].pi_value;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zprop_index_to_string(int prop, uint64_t index, const char **string,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (idx_tbl[i].pi_value == index) {
+ *string = idx_tbl[i].pi_name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+const char *
+zprop_values(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+
+ return (prop_tbl[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zprop_valid_for_type(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (B_FALSE);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ return ((prop_tbl[prop].pd_types & type) != 0);
+}
+
+#ifndef _KERNEL
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's fixed
+ * or not. Only string columns are non-fixed.
+ */
+size_t
+zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl, *pd;
+ const zprop_index_t *idx;
+ size_t ret;
+ int i;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+ pd = &prop_tbl[prop];
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ break;
+ case PROP_TYPE_INDEX:
+ idx = prop_tbl[prop].pd_table;
+ for (i = 0; idx[i].pi_name != NULL; i++) {
+ if (strlen(idx[i].pi_name) > ret)
+ ret = strlen(idx[i].pi_name);
+ }
+ break;
+
+ case PROP_TYPE_STRING:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 1800e792fa1e..cf49c78a5b0e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -20,11 +20,9 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This Makefile defines all file modules for the directory uts/common
# and its children. These are the source files which may be considered
# common to all SunOS systems.
@@ -46,7 +44,9 @@ ZFS_COMMON_OBJS += \
dsl_pool.o \
dsl_synctask.o \
dmu_zfetch.o \
+ dsl_deleg.o \
dsl_prop.o \
+ dsl_scrub.o \
fletcher.o \
gzip.o \
lzjb.o \
@@ -64,6 +64,7 @@ ZFS_COMMON_OBJS += \
unique.o \
vdev.o \
vdev_cache.o \
+ vdev_file.o \
vdev_label.o \
vdev_mirror.o \
vdev_missing.o \
@@ -75,6 +76,7 @@ ZFS_COMMON_OBJS += \
zap_micro.o \
zfs_byteswap.o \
zfs_fm.o \
+ zfs_fuid.o \
zfs_znode.o \
zil.o \
zio.o \
@@ -84,7 +86,11 @@ ZFS_COMMON_OBJS += \
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
- zfs_prop.o
+ zfs_deleg.o \
+ zfs_prop.o \
+ zfs_comutil.o \
+ zpool_prop.o \
+ zprop_common.o
ZFS_OBJS += \
$(ZFS_COMMON_OBJS) \
@@ -96,6 +102,7 @@ ZFS_OBJS += \
zfs_log.o \
zfs_replay.o \
zfs_rlock.o \
+ rrwlock.o \
zfs_vfsops.o \
zfs_vnops.o \
zvol.o
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index dd2aa82304ab..d9eb88a40202 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -20,7 +20,7 @@
*/
/* Portions Copyright 2007 Shivakumar GN */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
+#include <sys/sunddi.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
@@ -60,7 +61,7 @@
*
* These routines are designed to play a support role for existing
* pseudo-filesystems (such as procfs). They simplify common tasks,
- * without enforcing the filesystem to hand over management to GFS. The
+ * without forcing the filesystem to hand over management to GFS. The
* routines covered are:
*
* gfs_readdir_init()
@@ -116,6 +117,42 @@
*/
/*
+ * gfs_get_parent_ino: used to obtain a parent inode number and the
+ * inode number of the given vnode in preparation for calling gfs_readdir_init.
+ */
+int
+gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ ino64_t *pino, ino64_t *ino)
+{
+ vnode_t *parent;
+ gfs_dir_t *dp = dvp->v_data;
+ int error;
+
+ *ino = dp->gfsd_file.gfs_ino;
+ parent = dp->gfsd_file.gfs_parent;
+
+ if (parent == NULL) {
+ *pino = *ino; /* root of filesystem */
+ } else if (dvp->v_flag & V_XATTRDIR) {
+#ifdef TODO
+ vattr_t va;
+
+ va.va_mask = AT_NODEID;
+ error = VOP_GETATTR(parent, &va, 0, cr, ct);
+ if (error)
+ return (error);
+ *pino = va.va_nodeid;
+#else
+ panic("%s:%u: not implemented", __func__, __LINE__);
+#endif
+ } else {
+ *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
+ }
+
+ return (0);
+}
+
+/*
* gfs_readdir_init: initiate a generic readdir
* st - a pointer to an uninitialized gfs_readdir_state_t structure
* name_max - the directory's maximum file name length
@@ -123,6 +160,7 @@
* uiop - the uiop passed to readdir
* parent - the parent directory's inode
* self - this directory's inode
+ * flags - flags from VOP_READDIR
*
* Returns 0 or a non-zero errno.
*
@@ -153,8 +191,10 @@
*/
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
- uio_t *uiop, ino64_t parent, ino64_t self)
+ uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
+ size_t dirent_size;
+
if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
(uiop->uio_loffset % ureclen) != 0)
return (EINVAL);
@@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
st->grd_ureclen = ureclen;
st->grd_oresid = uiop->uio_resid;
st->grd_namlen = name_max;
- st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
+ if (flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
st->grd_parent = parent;
st->grd_self = self;
+ st->grd_flags = flags;
return (0);
}
@@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
/*
* gfs_readdir_emit_int: internal routine to emit directory entry
*
- * st - the current readdir state, which must have d_ino and d_name
- * set
+ * st - the current readdir state, which must have d_ino/ed_ino
+ * and d_name/ed_name set
* uiop - caller-supplied uio pointer
* next - the offset of the next entry
*/
@@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
int *ncookies, u_long **cookies)
{
int reclen, namlen;
+ dirent64_t *dp;
+ edirent_t *edp;
- namlen = strlen(st->grd_dirent->d_name);
- reclen = DIRENT64_RECLEN(namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp = st->grd_dirent;
+ namlen = strlen(edp->ed_name);
+ reclen = EDIRENT_RECLEN(namlen);
+ } else {
+ dp = st->grd_dirent;
+ namlen = strlen(dp->d_name);
+ reclen = DIRENT64_RECLEN(namlen);
+ }
if (reclen > uiop->uio_resid) {
/*
@@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
return (-1);
}
- /* XXX: This can change in the future. */
- st->grd_dirent->d_type = DT_DIR;
- st->grd_dirent->d_reclen = (ushort_t)reclen;
- st->grd_dirent->d_namlen = namlen;
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp->ed_off = next;
+ edp->ed_reclen = (ushort_t)reclen;
+ } else {
+ /* XXX: This can change in the future. */
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_type = DT_DIR;
+ dp->d_namlen = namlen;
+ }
if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
return (EFAULT);
@@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
* voff - the virtual offset (obtained from gfs_readdir_pred)
* ino - the entry's inode
* name - the entry's name
+ * eflags - value for ed_eflags (if processing edirent_t)
*
* Returns a 0 on success, a non-zero errno on failure, or -1 if the
* readdir loop should terminate. A non-zero result (either errno or
@@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
*/
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
- ino64_t ino, const char *name, int *ncookies, u_long **cookies)
+ ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies)
{
offset_t off = (voff + 2) * st->grd_ureclen;
- st->grd_dirent->d_ino = ino;
- (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edirent_t *edp = st->grd_dirent;
+
+ edp->ed_ino = ino;
+ (void) strncpy(edp->ed_name, name, st->grd_namlen);
+ edp->ed_eflags = eflags;
+ } else {
+ dirent64_t *dp = st->grd_dirent;
+
+ dp->d_ino = ino;
+ (void) strncpy(dp->d_name, name, st->grd_namlen);
+ }
/*
* Inter-entry offsets are invalid, so we assume a record size of
@@ -266,11 +336,11 @@ top:
voff = off - 2;
if (off == 0) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
- ".", ncookies, cookies)) == 0)
+ ".", 0, ncookies, cookies)) == 0)
goto top;
} else if (off == 1) {
if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
- "..", ncookies, cookies)) == 0)
+ "..", 0, ncookies, cookies)) == 0)
goto top;
} else {
*voffp = voff;
@@ -292,7 +362,13 @@ top:
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
- kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
+ size_t dirent_size;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ kmem_free(st->grd_dirent, dirent_size);
if (error > 0)
return (error);
if (eofp)
@@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp)
gfs_dir_t *dp = NULL;
void *data;
- if (fp->gfs_parent == NULL)
+ if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
goto found;
dp = fp->gfs_parent->v_data;
@@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp)
ge = NULL;
found:
+ if (vp->v_flag & V_XATTRDIR)
+ VI_LOCK(fp->gfs_parent);
VI_LOCK(vp);
ASSERT(vp->v_count < 2);
/*
@@ -535,7 +613,8 @@ found:
* Free vnode and release parent
*/
if (fp->gfs_parent) {
- gfs_dir_unlock(dp);
+ if (dp)
+ gfs_dir_unlock(dp);
VI_LOCK(fp->gfs_parent);
fp->gfs_parent->v_usecount--;
VI_UNLOCK(fp->gfs_parent);
@@ -543,6 +622,8 @@ found:
ASSERT(vp->v_vfsp != NULL);
VFS_RELE(vp->v_vfsp);
}
+ if (vp->v_flag & V_XATTRDIR)
+ VI_UNLOCK(fp->gfs_parent);
return (data);
}
@@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp)
}
/*
- * gfs_dir_lookup()
+ * gfs_dir_lookup_dynamic()
*
- * Looks up the given name in the directory and returns the corresponding vnode,
- * if found.
+ * This routine looks up the provided name amongst the dynamic entries
+ * in the gfs directory and returns the corresponding vnode, if found.
*
- * First, we search statically defined entries, if any. If a match is found,
- * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
- * existing vnode. Otherwise, we call the static entry's callback routine,
- * caching the result if necessary.
+ * The gfs directory is expected to be locked by the caller prior to
+ * calling this function. The directory will be unlocked during the
+ * execution of this function, but will be locked upon return from the
+ * function. This function returns 0 on success, non-zero on error.
*
- * If no static entry is found, we invoke the lookup callback, if any. The
- * arguments to this callback are:
+ * The dynamic lookups are performed by invoking the lookup
+ * callback, which is passed to this function as the first argument.
+ * The arguments to the callback are:
*
- * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
+ * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ * int flags, int *deflgs, pathname_t *rpnp);
*
* pvp - parent vnode
* nm - name of entry
* vpp - pointer to resulting vnode
+ * cr - pointer to cred
+ * flags - flags value from lookup request
+ * ignored here; currently only used to request
+ * insensitive lookups
+ * direntflgs - output parameter, directory entry flags
+ * ignored here; currently only used to indicate a lookup
+ * has more than one possible match when case is not considered
+ * realpnp - output parameter, real pathname
+ * ignored here; when lookup was performed case-insensitively,
+ * this field contains the "real" name of the file.
*
* Returns 0 on success, non-zero on error.
*/
-int
-gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
+static int
+gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
+ const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
+ int *direntflags, pathname_t *realpnp)
{
- int i;
- gfs_dirent_t *ge;
- vnode_t *vp;
- gfs_dir_t *dp = dvp->v_data;
- int ret = 0;
-
- ASSERT(dvp->v_type == VDIR);
+ gfs_file_t *fp;
+ ino64_t ino;
+ int ret;
- if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
- return (0);
+ ASSERT(GFS_DIR_LOCKED(dp));
+ /*
+ * Drop the directory lock, as the lookup routine
+ * will need to allocate memory, or otherwise deadlock on this
+ * directory.
+ */
+ gfs_dir_unlock(dp);
+ ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
gfs_dir_lock(dp);
/*
+ * The callback for extended attributes returns a vnode
+ * with v_data from an underlying fs.
+ */
+ if (ret == 0 && !IS_XATTRDIR(dvp)) {
+ fp = (gfs_file_t *)((*vpp)->v_data);
+ fp->gfs_index = -1;
+ fp->gfs_ino = ino;
+ }
+
+ return (ret);
+}
+
+/*
+ * gfs_dir_lookup_static()
+ *
+ * This routine looks up the provided name amongst the static entries
+ * in the gfs directory and returns the corresponding vnode, if found.
+ * The first argument to the function is a pointer to the comparison
+ * function this function should use to decide if names are a match.
+ *
+ * If a match is found, and GFS_CACHE_VNODE is set and the vnode
+ * exists, we simply return the existing vnode. Otherwise, we call
+ * the static entry's callback routine, caching the result if
+ * necessary. If the idx pointer argument is non-NULL, we use it to
+ * return the index of the matching static entry.
+ *
+ * The gfs directory is expected to be locked by the caller prior to calling
+ * this function. The directory may be unlocked during the execution of
+ * this function, but will be locked upon return from the function.
+ *
+ * This function returns 0 if a match is found, ENOENT if not.
+ */
+static int
+gfs_dir_lookup_static(int (*compare)(const char *, const char *),
+ gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
+ vnode_t **vpp, pathname_t *rpnp)
+{
+ gfs_dirent_t *ge;
+ vnode_t *vp = NULL;
+ int i;
+
+ ASSERT(GFS_DIR_LOCKED(dp));
+
+ /*
* Search static entries.
*/
for (i = 0; i < dp->gfsd_nstatic; i++) {
ge = &dp->gfsd_static[i];
- if (strcmp(ge->gfse_name, nm) == 0) {
+ if (compare(ge->gfse_name, nm) == 0) {
+ if (rpnp)
+ (void) strlcpy(rpnp->pn_buf, ge->gfse_name,
+ rpnp->pn_bufsize);
+
if (ge->gfse_vnode) {
ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
vp = ge->gfse_vnode;
VN_HOLD(vp);
- goto out;
+ break;
}
/*
@@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
* need to do KM_SLEEP allocations. If we return from
* the constructor only to find that a parallel
* operation has completed, and GFS_CACHE_VNODE is set
- * for this entry, we discard the result in favor of the
- * cached vnode.
+ * for this entry, we discard the result in favor of
+ * the cached vnode.
*/
gfs_dir_unlock(dp);
vp = ge->gfse_ctor(dvp);
@@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
gfs_dir_lock(dp);
}
}
-
- goto out;
+ break;
}
}
- /*
- * See if there is a dynamic constructor.
- */
- if (dp->gfsd_lookup) {
- ino64_t ino;
- gfs_file_t *fp;
+ if (vp == NULL)
+ return (ENOENT);
+ else if (idx)
+ *idx = i;
+ *vpp = vp;
+ return (0);
+}
- /*
- * Once again, drop the directory lock, as the lookup routine
- * will need to allocate memory, or otherwise deadlock on this
- * directory.
- */
- gfs_dir_unlock(dp);
- ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
- gfs_dir_lock(dp);
- if (ret != 0)
- goto out;
+/*
+ * gfs_dir_lookup()
+ *
+ * Looks up the given name in the directory and returns the corresponding
+ * vnode, if found.
+ *
+ * First, we search statically defined entries, if any, with a call to
+ * gfs_dir_lookup_static(). If no static entry is found, and we have
+ * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
+ *
+ * This function returns 0 on success, non-zero on error.
+ */
+int
+gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ int flags, int *direntflags, pathname_t *realpnp)
+{
+ gfs_dir_t *dp = dvp->v_data;
+ boolean_t casecheck;
+ vnode_t *dynvp = NULL;
+ vnode_t *vp = NULL;
+ int (*compare)(const char *, const char *);
+ int error, idx;
- fp = (gfs_file_t *)vp->v_data;
- fp->gfs_index = -1;
- fp->gfs_ino = ino;
- } else {
- /*
- * No static entry found, and there is no lookup callback, so
- * return ENOENT.
- */
- ret = ENOENT;
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
+ return (0);
+
+ casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
+ if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
+ (flags & FIGNORECASE))
+ compare = strcasecmp;
+ else
+ compare = strcmp;
+
+ gfs_dir_lock(dp);
+
+ error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
+
+ if (vp && casecheck) {
+ gfs_dirent_t *ge;
+ int i;
+
+ for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (strcasecmp(ge->gfse_name, nm) == 0) {
+ *direntflags |= ED_CASE_CONFLICT;
+ goto out;
+ }
+ }
+ }
+
+ if ((error || casecheck) && dp->gfsd_lookup)
+ error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
+ &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
+
+ if (vp && dynvp) {
+ /* static and dynamic entries are case-insensitive conflict */
+ ASSERT(casecheck);
+ *direntflags |= ED_CASE_CONFLICT;
+ VN_RELE(dynvp);
+ } else if (vp == NULL) {
+ vp = dynvp;
+ } else if (error == ENOENT) {
+ error = 0;
+ } else if (error) {
+ VN_RELE(vp);
+ vp = NULL;
}
out:
gfs_dir_unlock(dp);
- if (ret == 0)
- *vpp = vp;
- else
- *vpp = NULL;
-
- return (ret);
+ *vpp = vp;
+ return (error);
}
/*
@@ -731,13 +921,15 @@ out:
* This is significantly more complex, thanks to the particulars of
* VOP_READDIR().
*
- * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- * offset_t *off, offset_t *nextoff, void *data)
+ * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
+ * offset_t *off, offset_t *nextoff, void *data, int flags)
*
* vp - directory vnode
* dp - directory entry, sized according to maxlen given to
* gfs_dir_create(). callback must fill in d_name and
- * d_ino.
+ * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
+ * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
+ * is set in 'flags'.
* eofp - callback must set to 1 when EOF has been reached
* off - on entry, the last offset read from the directory. Callback
* must set to the offset of the current entry, typically left
@@ -745,12 +937,13 @@ out:
* nextoff - callback must set to offset of next entry. Typically
* (off + 1)
* data - caller-supplied data
+ * flags - VOP_READDIR flags
*
* Return 0 on success, or error on failure.
*/
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
- u_long **cookies, void *data)
+ u_long **cookies, void *data, cred_t *cr, int flags)
{
gfs_readdir_state_t gstate;
int error, eof = 0;
@@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
offset_t off, next;
gfs_dir_t *dp = dvp->v_data;
- ino = dp->gfsd_file.gfs_ino;
-
- if (dp->gfsd_file.gfs_parent == NULL)
- pino = ino; /* root of filesystem */
- else
- pino = ((gfs_file_t *)
- (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
+ error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
+ if (error)
+ return (error);
if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
- pino, ino)) != 0)
+ pino, ino, flags)) != 0)
return (error);
while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
@@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
ino = dp->gfsd_inode(dvp, off);
if ((error = gfs_readdir_emit(&gstate, uiop,
- off, ino, dp->gfsd_static[off].gfse_name, ncookies,
- cookies)) != 0)
+ off, ino, dp->gfsd_static[off].gfse_name, 0,
+ ncookies, cookies)) != 0)
break;
} else if (dp->gfsd_readdir) {
@@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
if ((error = dp->gfsd_readdir(dvp,
gstate.grd_dirent, &eof, &off, &next,
- data)) != 0 || eof)
+ data, flags)) != 0 || eof)
break;
off += dp->gfsd_nstatic + 2;
@@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
}
/*
+ * gfs_vop_lookup: VOP_LOOKUP() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_lookup() as necessary.
+ */
+/* ARGSUSED */
+int
+gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
+}
+
+/*
* gfs_vop_readdir: VOP_READDIR() entry point
*
* For use directly in vnode ops table. Given a GFS directory, calls
@@ -827,6 +1031,7 @@ gfs_vop_readdir(ap)
{
vnode_t *vp = ap->a_vp;
uio_t *uiop = ap->a_uio;
+ cred_t *cr = ap->a_cred;
int *eofp = ap->a_eofflag;
int ncookies = 0;
u_long *cookies = NULL;
@@ -842,7 +1047,8 @@ gfs_vop_readdir(ap)
*ap->a_ncookies = ncookies;
}
- error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
+ error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL,
+ cr, 0);
if (error == 0) {
/* Subtract unused cookies */
@@ -882,6 +1088,9 @@ gfs_vop_inactive(ap)
if (data != NULL)
kmem_free(data, fp->gfs_size);
+
+ VI_LOCK(vp);
vp->v_data = NULL;
+ VI_UNLOCK(vp);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
new file mode 100644
index 000000000000..00a10aae8ec9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Zero out the structure, set the size of the requested/returned bitmaps,
+ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
+ * to the returned attributes array.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+ bzero(xvap, sizeof (xvattr_t));
+ xvap->xva_mapsize = XVA_MAPSIZE;
+ xvap->xva_magic = XVA_MAGIC;
+ xvap->xva_vattr.va_mask = AT_XVATTR;
+ xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
+}
+
+/*
+ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
+ * structure. Otherwise, returns NULL.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ if (xvap->xva_vattr.va_mask & AT_XVATTR)
+ xoap = &xvap->xva_xoptattrs;
+ return (xoap);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 420f802f360d..7ca528033c4f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DVA-based Adjustable Replacement Cache
*
@@ -47,13 +45,13 @@
* There are times when it is not possible to evict the requested
* space. In these circumstances we are unable to adjust the cache
* size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slowes the flow of new data
- * into the cache until we can make space avaiable.
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
*
* 2. The Megiddo and Modha model assumes a fixed cache size.
* Pages are evicted when the cache is full and there is a cache
* miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory preasure from the
+ * high use, but also tries to react to memory pressure from the
* operating system: decreasing its size when system memory is
* tight.
*
@@ -75,7 +73,7 @@
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() inerface
+ * or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
* adjusting the cache use method 2. We therefor provide two
* types of locks: 1) the hash table lock array, and 2) the
@@ -109,6 +107,14 @@
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
*/
#include <sys/spa.h>
@@ -117,6 +123,7 @@
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
+#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern kmutex_t zfs_write_limit_lock;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan;
static int arc_dead;
/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
* These tunables are for performance analysis.
*/
-u_long zfs_arc_max;
-u_long zfs_arc_min;
-TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
-TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+int zfs_mdcomp_disable = 0;
+
+TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
+TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
"Maximum ARC size");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
"Minimum ARC size");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
- * Note that buffers can be on one of 5 states:
+ * Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recentely used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
- * When there are no active references to the buffer, they
- * are linked onto one of the lists in arc. These are the
- * only buffers that can be evicted or deleted.
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they are
+ * are linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
*
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
@@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
* they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will aquire a DVA
* as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
*/
typedef struct arc_state {
- list_t arcs_list; /* linked list of evictable buffer in state */
- uint64_t arcs_lsize; /* total size of buffers in the linked list */
- uint64_t arcs_size; /* total size of all buffers in this state */
+ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
+ uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
} arc_state_t;
-/* The 5 states: */
+/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
@@ -222,6 +259,24 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_memory_throttle_count;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -252,7 +307,25 @@ static arc_stats_t arc_stats = {
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -299,6 +372,7 @@ static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
/*
* There are several ARC variables that are critical to export as kstats --
@@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost;
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
+ &arc_meta_used, 0, "ARC metadata used");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
+ &arc_meta_limit, 0, "ARC metadata limit");
+
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
- arc_byteswap_func_t *acb_byteswap;
arc_buf_t *acb_buf;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
@@ -368,6 +450,9 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
};
static arc_buf_t *arc_eviction_list;
@@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
/*
* Private ARC flags. These flags are private ARC only flags that will show up
@@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
+#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
+ (hdr)->b_l2hdr != NULL)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
/*
* Hash table routines
@@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table;
uint64_t zfs_crc64_table[256];
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_FEED_SECS 1 /* caching interval */
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_boost; /* warmup write boost, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ daddr_t b_daddr; /* disk address, offset byte */
+};
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
{
uintptr_t spav = (uintptr_t)spa;
uint8_t *vdva = (uint8_t *)dva;
@@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
bzero(buf, sizeof (arc_buf_hdr_t));
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
return (0);
}
@@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
+ mutex_destroy(&buf->b_freeze_lock);
+
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ rw_destroy(&buf->b_lock);
}
/*
@@ -639,7 +854,7 @@ retry:
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
@@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+ int equal;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ return (equal);
+}
+
static void
-arc_cksum_compute(arc_buf_t *buf)
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
mutex_enter(&buf->b_hdr->b_freeze_lock);
@@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf)
void
arc_buf_thaw(arc_buf_t *buf)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ }
- if (buf->b_hdr->b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
mutex_enter(&buf->b_hdr->b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
@@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf)
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
}
static void
@@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
uint64_t delta = ab->b_size * ab->b_datacnt;
+ list_t *list = &ab->b_state->arcs_list[ab->b_type];
+ uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
mutex_enter(&ab->b_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->arcs_list, ab);
+ list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
ASSERT(delta > 0);
- ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
- atomic_add_64(&ab->b_state->arcs_lsize, -delta);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
mutex_exit(&ab->b_state->arcs_mtx);
- /* remove the prefetch flag is we get a reference */
+ /* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
}
@@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
+ uint64_t *size = &state->arcs_lsize[ab->b_type];
+
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list, ab);
+ list_insert_head(&state->arcs_list[ab->b_type], ab);
ASSERT(ab->b_datacnt > 0);
- atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
- ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
+ atomic_add_64(size, ab->b_size * ab->b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
@@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
if (refcnt == 0) {
if (old_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ uint64_t *size = &old_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list, ab);
+ list_remove(&old_state->arcs_list[ab->b_type], ab);
/*
* If prefetching out of the ghost cache,
@@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
from_delta = ab->b_size;
}
- ASSERT3U(old_state->arcs_lsize, >=, from_delta);
- atomic_add_64(&old_state->arcs_lsize, -from_delta);
+ ASSERT3U(*size, >=, from_delta);
+ atomic_add_64(size, -from_delta);
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
}
if (new_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ uint64_t *size = &new_state->arcs_lsize[ab->b_type];
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
- list_insert_head(&new_state->arcs_list, ab);
+ list_insert_head(&new_state->arcs_list[ab->b_type], ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(ab->b_buf == NULL);
to_delta = ab->b_size;
}
- atomic_add_64(&new_state->arcs_lsize, to_delta);
- ASSERT3U(new_state->arcs_size + to_delta, >=,
- new_state->arcs_lsize);
+ atomic_add_64(size, to_delta);
if (use_mutex)
mutex_exit(&new_state->arcs_mtx);
@@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
+ if (new_state == arc_anon) {
buf_hash_remove(ab);
}
@@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(&old_state->arcs_size, -from_delta);
}
ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+}
+
+void
+arc_space_consume(uint64_t space)
+{
+ atomic_add_64(&arc_meta_used, space);
+ atomic_add_64(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space)
+{
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ atomic_add_64(&arc_meta_used, -space);
+ ASSERT(arc_size >= space);
+ atomic_add_64(&arc_size, -space);
+}
+
+void *
+arc_data_buf_alloc(uint64_t size)
+{
+ if (arc_evict_needed(ARC_BUFC_DATA))
+ cv_signal(&arc_reclaim_thr_cv);
+ atomic_add_64(&arc_size, size);
+ return (zio_data_buf_alloc(size));
+}
+
+void
+arc_data_buf_free(void *buf, uint64_t size)
+{
+ zio_data_buf_free(buf, size);
+ ASSERT(arc_size >= size);
+ atomic_add_64(&arc_size, -size);
}
arc_buf_t *
@@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
hdr->b_spa = spa;
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
kmutex_t *hash_lock;
/*
- * Check to see if this buffer is currently being evicted via
- * arc_do_user_evicts().
+ * Check to see if this buffer is evicted. Callers
+ * must verify b_data != NULL to know if the add_ref
+ * was successful.
*/
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- mutex_exit(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_READER);
+ if (buf->b_data == NULL) {
+ rw_exit(&buf->b_lock);
return;
}
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
mutex_enter(hash_lock);
- if (buf->b_data == NULL) {
- /*
- * This buffer is evicted.
- */
- mutex_exit(hash_lock);
- return;
- }
+ rw_exit(&buf->b_lock);
- ASSERT(buf->b_hdr == hdr);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
arc_access(hdr, hash_lock);
@@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
data, metadata, hits);
}
+/*
+ * Free the arc data buffer. If it is an l2arc write in progress,
+ * the buffer is placed on l2arc_free_on_write to be freed later.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+ void *data, size_t size)
+{
+ if (HDR_L2_WRITING(hdr)) {
+ l2arc_data_free_t *df;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ free_func(data, size);
+ }
+}
+
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
@@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_cksum_verify(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
+ arc_space_return(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
+ atomic_add_64(&arc_size, -size);
}
- atomic_add_64(&arc_size, -size);
}
if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ uint64_t *cnt = &state->arcs_lsize[type];
+
ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
ASSERT(state != arc_anon);
- ASSERT3U(state->arcs_lsize, >=, size);
- atomic_add_64(&state->arcs_lsize, -size);
+
+ ASSERT3U(*cnt, >=, size);
+ atomic_add_64(cnt, -size);
}
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
@@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!(hdr->b_flags & ARC_STORED));
+
+ if (hdr->b_l2hdr != NULL) {
+ if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ if (hdr->b_l2hdr != NULL) {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
+ hdr);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+ } else {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ }
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
@@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- mutex_destroy(&hdr->b_freeze_lock);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf)
* - return the data block from this buffer rather than freeing it.
* This flag is used by callers that are trying to make space for a
* new buffer in a full arc cache.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
*/
static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type)
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
arc_buf_hdr_t *ab, *ab_prev = NULL;
+ list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
@@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
+ (spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
skipped++;
@@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
+ if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ missed += 1;
+ break;
+ }
if (buf->b_data) {
bytes_evicted += ab->b_size;
if (recycle && ab->b_type == type &&
- ab->b_size == bytes) {
+ ab->b_size == bytes &&
+ !HDR_L2_WRITING(ab)) {
stolen = buf->b_data;
recycle = FALSE;
}
@@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
} else {
+ rw_exit(&buf->b_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
}
- ASSERT(ab->b_datacnt == 0);
- arc_change_state(evicted_state, ab, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags = ARC_IN_HASH_TABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ if (ab->b_datacnt == 0) {
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags |= ARC_IN_HASH_TABLE;
+ ab->b_flags &= ~ARC_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ }
if (!have_lock)
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
@@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
if (missed)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
+ /*
+ * We have just evicted some date into the ghost state, make
+ * sure we also adjust the ghost state size if necessary.
+ */
+ if (arc_no_grow &&
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+ arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete =
+ MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+ arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
+
return (stolen);
}
@@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
@@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes)
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+ if (spa && ab->b_spa != spa)
+ continue;
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += ab->b_size;
- arc_hdr_destroy(ab);
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, ab, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ }
+
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
@@ -1256,6 +1630,12 @@ top:
}
mutex_exit(&state->arcs_mtx);
+ if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ (bytes < 0 || bytes_deleted < bytes)) {
+ list = &state->arcs_list[ARC_BUFC_METADATA];
+ goto top;
+ }
+
if (bufs_skipped) {
ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
ASSERT(bytes >= 0);
@@ -1271,38 +1651,58 @@ arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over, todelete;
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
- if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
- (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
}
mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
if (mru_over > 0) {
- if (arc_mru_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
- arc_evict_ghost(arc_mru_ghost, todelete);
+ if (arc_mru_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
}
}
if ((arc_over = arc_size - arc_c) > 0) {
int64_t tbl_over;
- if (arc_mfu->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
- (void) arc_evict(arc_mfu, toevict, FALSE,
- ARC_BUFC_UNDEF);
+ if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_DATA);
+ arc_over = arc_size - arc_c;
+ }
+
+ if (arc_over > 0 &&
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
+ arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
}
- tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
- arc_mfu_ghost->arcs_lsize - arc_c*2;
+ tbl_over = arc_size + arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c * 2;
- if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
- arc_evict_ghost(arc_mfu_ghost, todelete);
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
}
@@ -1314,7 +1714,9 @@ arc_do_user_evicts(void)
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
arc_eviction_list = buf->b_next;
+ rw_enter(&buf->b_lock, RW_WRITER);
buf->b_hdr = NULL;
+ rw_exit(&buf->b_lock);
mutex_exit(&arc_eviction_mtx);
if (buf->b_efunc != NULL)
@@ -1329,24 +1731,40 @@ arc_do_user_evicts(void)
}
/*
- * Flush all *evictable* data from the cache.
+ * Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
void
-arc_flush(void)
+arc_flush(spa_t *spa)
{
- while (list_head(&arc_mru->arcs_list))
- (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
- while (list_head(&arc_mfu->arcs_list))
- (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
- arc_evict_ghost(arc_mru_ghost, -1);
- arc_evict_ghost(arc_mfu_ghost, -1);
+ arc_evict_ghost(arc_mru_ghost, spa, -1);
+ arc_evict_ghost(arc_mfu_ghost, spa, -1);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
mutex_exit(&arc_reclaim_thr_lock);
- ASSERT(arc_eviction_list == NULL);
+ ASSERT(spa || arc_eviction_list == NULL);
}
int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
@@ -1380,7 +1798,7 @@ arc_shrink(void)
arc_adjust();
}
-static int zfs_needfree = 0;
+static int needfree = 0;
static int
arc_reclaim_needed(void)
@@ -1391,13 +1809,28 @@ arc_reclaim_needed(void)
#ifdef _KERNEL
- if (zfs_needfree)
+ if (needfree)
return (1);
#if 0
/*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
* check to make sure that swapfs has enough space so that anon
- * reservations can still succeeed. anon_resvmem() checks that the
+ * reservations can still succeed. anon_resvmem() checks that the
* availrmem is greater than swapfs_minfree, and the number of reserved
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
@@ -1405,23 +1838,6 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then check that the size of available vmem for this area remains
- * above 1/4th free. This needs to be done when the size of the
- * non-default segment is smaller than physical memory, so we could
- * conceivably run out of VA in that segment before running out of
- * physical memory.
- */
- if (zio_arena != NULL) {
- size_t arc_ziosize =
- btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
-
- if ((physmem > arc_ziosize) &&
- (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
- return (1);
- }
-
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -1431,7 +1847,7 @@ arc_reclaim_needed(void)
* can have in the system. However, this is generally fixed at 25 pages
* which is so low that it's useless. In this comparison, we seek to
* calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the caclulation, if less than 1/4th is
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
if (btop(vmem_size(heap_arena, VMEM_FREE)) <
@@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
#ifdef _KERNEL
- /*
- * First purge some DNLC entries, in case the DNLC is using
- * up too much memory.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-
+ if (arc_meta_used >= arc_meta_limit) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
#if defined(__i386)
/*
* Reclaim unused memory from all kmem caches.
@@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * An agressive reclamation will shrink the cache size as well as
+ * An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
@@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused)
/* reset the growth delay for every reclaim */
growtime = LBOLT + (arc_grow_retry * hz);
- ASSERT(growtime > 0);
- if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
+ if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
/*
- * If zfs_needfree is TRUE our vm_lowmem hook
+ * If needfree is TRUE our vm_lowmem hook
* was called and in that case we must free some
* memory, so switch to aggressive mode.
*/
@@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused)
last_reclaim = ARC_RECLAIM_AGGR;
}
arc_kmem_reap_now(last_reclaim);
- } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) {
+ arc_warm = B_TRUE;
+
+ } else if (arc_no_grow && LBOLT >= growtime) {
arc_no_grow = FALSE;
}
- if (zfs_needfree ||
+ if (needfree ||
(2 * arc_c < arc_size +
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
arc_adjust();
@@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused)
arc_do_user_evicts();
if (arc_reclaim_needed()) {
- zfs_needfree = 0;
+ needfree = 0;
#ifdef _KERNEL
- wakeup(&zfs_needfree);
+ wakeup(&needfree);
#endif
}
@@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
+ if (state == arc_l2c_only)
+ return;
+
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
@@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state)
* prior to insert.
*/
static int
-arc_evict_needed()
+arc_evict_needed(arc_buf_contents_t type)
{
+ if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
+ return (1);
+
+#if 0
+#ifdef _KERNEL
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this area remains
+ * above about 1/32nd free.
+ */
+ if (type == ARC_BUFC_DATA && zio_arena != NULL &&
+ vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
+ return (1);
+#endif
+#endif
+
if (arc_reclaim_needed())
return (1);
@@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf)
* We have not yet reached cache maximum size,
* just allocate a new buffer.
*/
- if (!arc_evict_needed()) {
+ if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
goto out;
}
@@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf)
if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_p > mru_used) ? arc_mfu : arc_mru;
+ state = (arc_mfu->arcs_lsize[type] > 0 &&
+ arc_p > mru_used) ? arc_mfu : arc_mru;
} else {
/* MFU cases */
uint64_t mfu_space = arc_c - arc_p;
- state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ state = (arc_mru->arcs_lsize[type] > 0 &&
+ mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ atomic_add_64(&arc_size, size);
}
- atomic_add_64(&arc_size, size);
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -1728,7 +2170,7 @@ out:
atomic_add_64(&hdr->b_state->arcs_size, size);
if (list_link_active(&hdr->b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize, size);
+ atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
}
/*
* If we are growing the cache, and we are adding anonymous
@@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
if (refcount_count(&buf->b_refcnt) == 0) {
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mru->arcs_mtx);
- list_remove(&arc_mru->arcs_list, buf);
- list_insert_head(&arc_mru->arcs_list, buf);
- mutex_exit(&arc_mru->arcs_mtx);
} else {
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
@@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if ((buf->b_flags & ARC_PREFETCH) != 0) {
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mfu->arcs_mtx);
- list_remove(&arc_mfu->arcs_list, buf);
- list_insert_head(&arc_mfu->arcs_list, buf);
- mutex_exit(&arc_mfu->arcs_mtx);
}
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = LBOLT;
@@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ buf->b_arc_access = LBOLT;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
@@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_done_func_t */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
@@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio)
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~ARC_L2_EVICTED;
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags &= ~ARC_L2CACHE;
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
- callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+ byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+ func(buf->b_data, hdr->b_size);
+ }
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
/* create copies of the data buffer for the callers */
abuf = buf;
@@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
}
/*
@@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio)
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp. But if you know you don't need locking, you can use
+ * arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ int err;
+ arc_buf_hdr_t *hdr = pbuf->b_hdr;
+
+ ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
+ ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
+ rw_enter(&pbuf->b_lock, RW_READER);
+
+ err = arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb);
+
+ ASSERT3P(hdr, ==, pbuf->b_hdr);
+ rw_exit(&pbuf->b_lock);
+ return (err);
+}
+
+int
+arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
kmutex_t *hash_lock;
- zio_t *rzio;
+ zio_t *rzio;
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
@@ -2053,10 +2525,9 @@ top:
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, flags);
+ spa, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
acb->acb_next = hdr->b_acb;
@@ -2093,6 +2564,8 @@ top:
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -2104,6 +2577,8 @@ top:
} else {
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ daddr_t addr;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -2130,6 +2605,8 @@ top:
private);
hdr->b_flags |= ARC_PREFETCH;
}
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_INDIRECT;
} else {
@@ -2144,7 +2621,9 @@ top:
hdr->b_flags |= ARC_PREFETCH;
else
add_reference(hdr, hash_lock, private);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
@@ -2160,7 +2639,6 @@ top:
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- acb->acb_byteswap = swap;
ASSERT(hdr->b_acb == NULL);
hdr->b_acb = acb;
@@ -2176,6 +2654,18 @@ top:
if (GHOST_STATE(hdr->b_state))
arc_access(hdr, hash_lock);
+
+ if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
+ (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ addr = hdr->b_l2hdr->b_daddr;
+ /*
+ * Lock out device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
@@ -2186,8 +2676,65 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
+ if (vd != NULL) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ */
+ if (hdr->b_l2hdr != NULL &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority, zio_flags |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+
+ if (*arc_flags & ARC_NOWAIT) {
+ zio_nowait(rzio);
+ return (0);
+ }
+
+ ASSERT(*arc_flags & ARC_WAIT);
+ if (zio_wait(rzio) == 0)
+ return (0);
+
+ /* l2arc read error; goto zio_read() */
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ }
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags, zb);
+ arc_read_done, buf, priority, zio_flags, zb);
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_buf_t **bufp;
- mutex_enter(&arc_eviction_mtx);
+ rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- mutex_exit(&arc_eviction_mtx);
+ rw_exit(&buf->b_lock);
return (0);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
-
- if (buf->b_data == NULL) {
+ } else if (buf->b_data == NULL) {
+ arc_buf_t copy = *buf; /* structure assignment */
/*
- * We are on the eviction list.
+ * We are on the eviction list; process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
*/
- mutex_exit(hash_lock);
- mutex_enter(&arc_eviction_mtx);
- if (buf->b_hdr == NULL) {
- /*
- * We are already in arc_do_user_evicts().
- */
- mutex_exit(&arc_eviction_mtx);
- return (0);
- } else {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * Process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
+ buf->b_efunc = NULL;
+ rw_exit(&buf->b_lock);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
}
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
@@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf)
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&old_state->arcs_mtx);
}
mutex_exit(hash_lock);
+ rw_exit(&buf->b_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
@@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf)
* Release this buffer from the cache. This must be done
* after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
+ * a new hdr for the buffer.
*/
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_size;
+
+ rw_enter(&buf->b_lock, RW_WRITER);
+ hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
@@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf);
+ rw_exit(&buf->b_lock);
return;
}
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ l2hdr = hdr->b_l2hdr;
+ if (l2hdr) {
+ mutex_enter(&l2arc_buflist_mtx);
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+
/*
* Do we have more than one buf?
*/
- if (hdr->b_buf != buf || buf->b_next != NULL) {
+ if (hdr->b_datacnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
- ASSERT(hdr->b_datacnt > 1);
+ ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
* Pull the data off of this buf and attach it to
* a new anonymous buf.
@@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
if (refcount_is_zero(&hdr->b_refcnt)) {
- ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
+ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3U(*size, >=, hdr->b_size);
+ atomic_add_64(size, -hdr->b_size);
}
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
nhdr->b_type = type;
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
nhdr->b_freeze_cksum = NULL;
- mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
+ rw_exit(&buf->b_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
} else {
+ rw_exit(&buf->b_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
mutex_exit(hash_lock);
+
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
@@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag)
}
buf->b_efunc = NULL;
buf->b_private = NULL;
+
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ mutex_exit(&l2arc_buflist_mtx);
+ }
}
int
arc_released(arc_buf_t *buf)
{
- return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ int released;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ rw_exit(&buf->b_lock);
+ return (released);
}
int
arc_has_callback(arc_buf_t *buf)
{
- return (buf->b_efunc != NULL);
+ int callback;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ callback = (buf->b_efunc != NULL);
+ rw_exit(&buf->b_lock);
+ return (callback);
}
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
- return (refcount_count(&buf->b_hdr->b_refcnt));
+ int referenced;
+
+ rw_enter(&buf->b_lock, RW_READER);
+ referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ rw_exit(&buf->b_lock);
+ return (referenced);
}
#endif
@@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- if (callback->awcb_ready) {
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum.
+ * It is the responsibility of the callback to handle the
+ * accounting for any re-write attempt.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
}
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
static void
@@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio)
hdr->b_acb = NULL;
- /* this buffer is on no lists and is not in the hash table */
- ASSERT3P(hdr->b_state, ==, arc_anon);
-
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
@@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio)
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
BP_IDENTITY(zio->io_bp)));
ASSERT3U(zio->io_bp_orig.blk_birth, ==,
@@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio)
ASSERT3P(exists, ==, NULL);
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- arc_access(hdr, hash_lock);
+ /* if it's not anon, we are doing a scrub */
+ if (hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else if (callback->awcb_done == NULL) {
int destroy_hdr;
@@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio)
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
+ hdr->b_flags &= ~ARC_STORED;
if (callback->awcb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
@@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio)
kmem_free(callback, sizeof (arc_write_callback_t));
}
+static void
+write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
+{
+ boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
+
+ /* Determine checksum setting */
+ if (ismd) {
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
+ !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
+ zp->zp_checksum = wp->wp_oschecksum;
+ else
+ zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ } else {
+ zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
+ wp->wp_oschecksum);
+ }
+
+ /* Determine compression setting */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+ } else {
+ zp->zp_compress = zio_compress_select(wp->wp_dncompress,
+ wp->wp_oscompress);
+ }
+
+ zp->zp_type = wp->wp_type;
+ zp->zp_level = wp->wp_level;
+ zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
+}
+
zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
+ boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
- zio_t *zio;
+ zio_t *zio;
+ zio_prop_t zp;
- /* this is a private buffer - no locking required */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(ready != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
ASSERT(hdr->b_acb == 0);
+ if (l2arc)
+ hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
- zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
- priority, flags, zb);
+
+ write_policy(spa, wp, &zp);
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
@@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
* nonzero, it should match what we have in the cache.
*/
ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
+ bp->blk_fill == BLK_FILL_ALREADY_FREED);
+
if (ab->b_state != arc_anon)
arc_change_state(arc_anon, ab, hash_lock);
if (HDR_IO_IN_PROGRESS(ab)) {
@@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
} else if (refcount_is_zero(&ab->b_refcnt)) {
+ ab->b_flags |= ARC_FREE_IN_PROGRESS;
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
ARCSTAT_BUMP(arcstat_deleted);
@@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
}
- zio = zio_free(pio, spa, txg, bp, done, private);
+ zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
if (arc_flags & ARC_WAIT)
return (zio_wait(zio));
@@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
return (0);
}
+static int
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t inflight_data = arc_anon->arcs_size;
+ uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
+ static uint64_t page_load = 0;
+ static uint64_t last_txg = 0;
+
+#if 0
+#if defined(__i386)
+ available_memory =
+ MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+#endif
+#endif
+ if (available_memory >= zfs_write_limit_max)
+ return (0);
+
+ if (txg > last_txg) {
+ last_txg = txg;
+ page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight,
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == pageproc) {
+ if (page_load > available_memory / 4)
+ return (ERESTART);
+ /* Note: reserve is inflated, so we deflate */
+ page_load += reserve / 8;
+ return (0);
+ } else if (page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (EAGAIN);
+ }
+ page_load = 0;
+
+ if (arc_size > arc_c_min) {
+ uint64_t evictable_memory =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+ }
+
+ if (inflight_data > available_memory / 4) {
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (ERESTART);
+ }
+#endif
+ return (0);
+}
+
void
-arc_tempreserve_clear(uint64_t tempreserve)
+arc_tempreserve_clear(uint64_t reserve)
{
- atomic_add_64(&arc_tempreserve, -tempreserve);
+ atomic_add_64(&arc_tempreserve, -reserve);
ASSERT((int64_t)arc_tempreserve >= 0);
}
int
-arc_tempreserve_space(uint64_t tempreserve)
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
+ int error;
+
#ifdef ZFS_DEBUG
/*
* Once in a while, fail for no reason. Everything should cope.
@@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve)
return (ERESTART);
}
#endif
- if (tempreserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, tempreserve * 4);
- if (tempreserve > arc_c)
+ if (reserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, reserve * 4);
+ if (reserve > arc_c)
return (ENOMEM);
/*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefor need to
+ * make sure that there is sufficient available memory for this.
+ */
+ if (error = arc_memory_throttle(reserve, txg))
+ return (error);
+
+ /*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
- *
- * XXX The limit should be adjusted dynamically to keep the time
- * to sync a dataset fixed (around 1-5 seconds?).
*/
-
- if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
- dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
- "tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
- tempreserve>>10, arc_c>>10);
+ if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+ arc_anon->arcs_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+ reserve>>10, arc_c>>10);
return (ERESTART);
}
- atomic_add_64(&arc_tempreserve, tempreserve);
+ atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
@@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused)
/* Serialize access via arc_lowmem_lock. */
mutex_enter(&arc_lowmem_lock);
- zfs_needfree = 1;
+ needfree = 1;
cv_signal(&arc_reclaim_thr_cv);
- while (zfs_needfree)
- tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
+ while (needfree)
+ tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
mutex_exit(&arc_lowmem_lock);
}
#endif
@@ -2743,6 +3442,16 @@ arc_init(void)
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -2757,6 +3466,7 @@ arc_init(void)
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -2764,15 +3474,28 @@ arc_init(void)
mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
@@ -2798,6 +3521,13 @@ arc_init(void)
#endif
arc_dead = FALSE;
+ arc_warm = B_FALSE;
+
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
#ifdef _KERNEL
/* Warn about ZFS memory and address space requirements. */
@@ -2808,9 +3538,9 @@ arc_init(void)
if (kmem_size() < 512 * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
"expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
+ printf(" Consider tuning vm.kmem_size and "
"vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
+ printf(" in /boot/loader.conf.\n");
}
#endif
}
@@ -2818,6 +3548,7 @@ arc_init(void)
void
arc_fini(void)
{
+
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
cv_signal(&arc_reclaim_thr_cv);
@@ -2825,7 +3556,7 @@ arc_fini(void)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
mutex_exit(&arc_reclaim_thr_lock);
- arc_flush();
+ arc_flush(NULL);
arc_dead = TRUE;
@@ -2838,10 +3569,14 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc_mru->arcs_list);
- list_destroy(&arc_mru_ghost->arcs_list);
- list_destroy(&arc_mfu->arcs_list);
- list_destroy(&arc_mfu_ghost->arcs_list);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
mutex_destroy(&arc_anon->arcs_mtx);
mutex_destroy(&arc_mru->arcs_mtx);
@@ -2849,6 +3584,8 @@ arc_fini(void)
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ mutex_destroy(&zfs_write_limit_lock);
+
buf_fini();
mutex_destroy(&arc_lowmem_lock);
@@ -2857,3 +3594,985 @@ arc_fini(void)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif
}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate for this there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ */
+
+static void
+l2arc_hdr_stat_add(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+static void
+l2arc_hdr_stat_remove(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+}
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev));
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev))
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write()
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT(df->l2df_data != NULL);
+ ASSERT(df->l2df_func != NULL);
+ df->l2df_func(df->l2df_data, df->l2df_size);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *ab, *ab_prev;
+ l2arc_buf_hdr_t *abl2;
+ kmutex_t *hash_lock;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT(dev != NULL);
+ head = cb->l2wcb_head;
+ ASSERT(head != NULL);
+ buflist = dev->l2ad_buflist;
+ ASSERT(buflist != NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+ for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * This buffer misses out. It may be in a stage
+ * of eviction. Its ARC_L2_WRITING flag will be
+ * left set, denying reads to this buffer.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+ continue;
+ }
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, ab);
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+
+ /*
+ * Allow ARC to begin reads to this L2ARC entry.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ int equal;
+
+ ASSERT(zio->io_vd != NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ buf = cb->l2rcb_buf;
+ ASSERT(buf != NULL);
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ equal = arc_cksum_equal(buf);
+ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = buf;
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ arc_read_done(zio);
+ } else {
+ mutex_exit(hash_lock);
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = EIO;
+ }
+ if (!equal)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL)
+ zio_nowait(zio_read(zio->io_parent,
+ cb->l2rcb_spa, &cb->l2rcb_bp,
+ buf->b_data, zio->io_size, arc_read_done, buf,
+ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static list_t *
+l2arc_list_locked(int list_num, kmutex_t **lock)
+{
+ list_t *list;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 1:
+ list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ case 2:
+ list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 3:
+ list = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ }
+
+ ASSERT(!(MUTEX_HELD(*lock)));
+ mutex_enter(*lock);
+ return (list);
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ buflist = dev->l2ad_buflist;
+
+ if (buflist == NULL)
+ return;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&l2arc_buflist_mtx);
+ for (ab = list_tail(buflist); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&l2arc_buflist_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ if (HDR_L2_WRITE_HEAD(ab)) {
+ /*
+ * We hit a write head node. Leave it for
+ * l2arc_write_done().
+ */
+ list_remove(buflist, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (!all && ab->b_l2hdr != NULL &&
+ (ab->b_l2hdr->b_daddr > taddr ||
+ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (HDR_FREE_IN_PROGRESS(ab)) {
+ /*
+ * Already on the path to destruction.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_state == arc_l2c_only) {
+ ASSERT(!HDR_L2_READING(ab));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_size.
+ */
+ arc_change_state(arc_anon, ab, hash_lock);
+ arc_hdr_destroy(ab);
+ } else {
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(ab)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ ab->b_flags |= ARC_L2_EVICTED;
+ }
+
+ /*
+ * Tell ARC this no longer exists in L2ARC.
+ */
+ if (ab->b_l2hdr != NULL) {
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+ list_remove(buflist, ab);
+
+ /*
+ * This may have been leftover after a
+ * failed write.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ dev->l2ad_evict = taddr;
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ */
+static void
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *ab, *ab_prev, *head;
+ l2arc_buf_hdr_t *hdrl2;
+ list_t *list;
+ uint64_t passed_sz, write_sz, buf_sz, headroom;
+ void *buf_data;
+ kmutex_t *hash_lock, *list_lock;
+ boolean_t have_lock, full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+ int try;
+
+ ASSERT(dev->l2ad_vdev != NULL);
+
+ pio = NULL;
+ write_sz = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head->b_flags |= ARC_L2_WRITE_HEAD;
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ for (try = 0; try <= 3; try++) {
+ list = l2arc_list_locked(try, &list_lock);
+ passed_sz = 0;
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ headroom = target_sz * l2arc_headroom;
+ if (arc_warm == B_FALSE)
+ ab = list_head(list);
+ else
+ ab = list_tail(list);
+
+ for (; ab; ab = ab_prev) {
+ if (arc_warm == B_FALSE)
+ ab_prev = list_next(list, ab);
+ else
+ ab_prev = list_prev(list, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (!have_lock && !mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += ab->b_size;
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_spa != spa) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if ((write_sz + ab->b_size) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_buf == NULL) {
+ DTRACE_PROBE1(l2arc__buf__null, void *, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ list_insert_head(dev->l2ad_buflist, head);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ /*
+ * Create and add a new L2ARC header.
+ */
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2->b_dev = dev;
+ hdrl2->b_daddr = dev->l2ad_hand;
+
+ ab->b_flags |= ARC_L2_WRITING;
+ ab->b_l2hdr = hdrl2;
+ list_insert_head(dev->l2ad_buflist, ab);
+ buf_data = ab->b_buf->b_data;
+ buf_sz = ab->b_size;
+
+ /*
+ * Compute and store the buffer cksum before
+ * writing. On debug the cksum is verified first.
+ */
+ arc_cksum_verify(ab->b_buf);
+ arc_cksum_compute(ab->b_buf, B_TRUE);
+
+ mutex_exit(hash_lock);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /*
+ * Keep the clock hand suitably device-aligned.
+ */
+ buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+
+ write_sz += buf_sz;
+ dev->l2ad_hand += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ if (pio == NULL) {
+ ASSERT3U(write_sz, ==, 0);
+ kmem_cache_free(hdr_cache, head);
+ return;
+ }
+
+ ASSERT3U(write_sz, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_size, write_sz);
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ spa_l2cache_space_update(dev->l2ad_vdev, 0,
+ dev->l2ad_end - dev->l2ad_hand);
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
+ (void) zio_wait(pio);
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+static void
+l2arc_feed_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ /*
+ * Pause for l2arc_feed_secs seconds between writes.
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ hz * l2arc_feed_secs);
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT(spa != NULL);
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = dev->l2ad_write;
+ if (arc_warm == B_FALSE)
+ size += dev->l2ad_boost;
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ l2arc_write_buffers(spa, dev, size);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev != NULL);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+{
+ l2arc_dev_t *adddev;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ adddev->l2ad_write = l2arc_write_max;
+ adddev->l2ad_boost = l2arc_write_boost;
+ adddev->l2ad_start = start;
+ adddev->l2ad_end = end;
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ ASSERT3U(adddev->l2ad_write, >, 0);
+
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2node));
+
+ spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
+ nextdev = list_next(l2arc_dev_list, dev);
+ if (vd == dev->l2ad_vdev) {
+ remdev = dev;
+ break;
+ }
+ }
+ ASSERT(remdev != NULL);
+
+ /*
+ * Remove device from global list
+ */
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(remdev->l2ad_buflist);
+ kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ kmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ /*
+ * This is called from dmu_fini(), which is called from spa_fini();
+ * Because of this, we can assume that all l2arc devices have
+ * already been removed when the pools themselves were removed.
+ */
+
+ l2arc_do_free_on_write();
+
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_buflist_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode & FWRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode & FWRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
index 4442b1f28ac8..93b7741d77be 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/bplist.h>
#include <sys/zfs_context.h>
@@ -47,7 +45,7 @@ bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
{
int size;
- size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
+ size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
@@ -181,7 +179,7 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
}
int
-bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
{
uint64_t blk, off;
blkptr_t *bparray;
@@ -229,7 +227,7 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
* Deferred entry; will be written later by bplist_sync().
*/
void
-bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
{
bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
@@ -278,9 +276,7 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
int
bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
- uint64_t itor = 0, comp = 0, uncomp = 0;
int err;
- blkptr_t bp;
mutex_enter(&bpl->bpl_lock);
@@ -298,6 +294,9 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
mutex_exit(&bpl->bpl_lock);
if (!bpl->bpl_havecomp) {
+ uint64_t itor = 0, comp = 0, uncomp = 0;
+ blkptr_t bp;
+
while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
comp += BP_GET_PSIZE(&bp);
uncomp += BP_GET_UCSIZE(&bp);
@@ -310,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
return (err);
}
+
+/*
+ * Return (in *dasizep) the amount of space on the deadlist which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *dasizep)
+{
+ uint64_t size = 0;
+ uint64_t itor = 0;
+ blkptr_t bp;
+ int err;
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpl_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
+ mutex_enter(&bpl->bpl_lock);
+ err = bplist_hold(bpl);
+ if (err == 0)
+ *dasizep = bpl->bpl_phys->bpl_bytes;
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
+ if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
+ size +=
+ bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
+ }
+ }
+ if (err == ENOENT)
+ err = 0;
+ *dasizep = size;
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 94c63081478a..2494c1e7f9d1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@@ -39,17 +37,10 @@
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;
-int zfs_mdcomp_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
/*
* Global data structures and functions for the dbuf cache.
*/
@@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- blkptr_t *bp;
+ dnode_t *dn = db->db_dnode;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
+ arc_buf_t *pbuf;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ int bonuslen = dn->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
+ arc_space_consume(DN_MAX_BONUSLEN);
+ if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+ bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
}
- if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
- bp = NULL;
- else
- bp = db->db_blkptr;
-
- if (bp == NULL)
- dprintf_dbuf(db, "blkptr: %s\n", "NULL");
- else
- dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
- if (bp == NULL || BP_IS_HOLE(bp)) {
+ /*
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- ASSERT(bp == NULL || BP_IS_HOLE(bp));
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
@@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_L2CACHE;
+
zb.zb_objset = db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : 0;
zb.zb_object = db->db.db_object;
@@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
- ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
- (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
- db->db_level > 0 ? byteswap_uint64_array :
- dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+
+ if (db->db_parent)
+ pbuf = db->db_parent->db_buf;
+ else
+ pbuf = db->db_objset->os_phys_buf;
+
+ (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
@@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DB_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
@@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/* free this block */
if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
/* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ (void) dsl_free(NULL,
+ spa_get_dsl(db->db_dnode->dn_objset->os_spa),
txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
arc_release(dr->dt.dl.dr_data, db);
}
+/*
+ * Evict (if its unreferenced) or clear (if its referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks. Also, if we happen accross any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t first_l1 = start >> epbs;
+ uint64_t last_l1 = end >> epbs;
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ if (end > dn->dn_maxblkid) {
+ end = dn->dn_maxblkid;
+ last_l1 = end >> epbs;
+ }
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ if (db->db_level == 1 &&
+ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_last_dirty &&
+ db->db_last_dirty->dr_txg < txg) {
+ dbuf_add_ref(db, FTAG);
+ mutex_exit(&db->db_mtx);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
+ if (db->db_blkid < start || db->db_blkid > end)
continue;
/* found a level 0 buffer in the range */
@@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- /* If we have been dirtied since the last snapshot, its not new */
if (db->db_last_dirty)
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
+ /* If we don't exist or are in a snapshot, we can't be freed */
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
else
- return (TRUE);
+ return (FALSE);
}
void
@@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
+ boolean_t do_free_accounting = B_FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(tx->tx_txg != 0);
@@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drp = &db->db_last_dirty;
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
db->db.db_object == DMU_META_DNODE_OBJECT);
- while (*drp && (*drp)->dr_txg > tx->tx_txg)
- drp = &(*drp)->dr_next;
- if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
- dbuf_unoverride(*drp);
+ dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
- return (*drp);
+ return (dr);
}
/*
@@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ * Note: we delay "free accounting" until after we drop
+ * the db_mtx. This keeps us from grabbing other locks
+ * (and possibly deadlocking) in bp_get_dasize() while
+ * also holding the db_mtx.
+ */
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ do_free_accounting = dbuf_block_freeable(db);
+ }
+
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
@@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_freed_in_flight = FALSE;
}
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
/*
* This buffer is now part of this txg
*/
@@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
return (dr);
- }
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ } else if (do_free_accounting) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
}
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
@@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drop_struct_lock = TRUE;
}
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
if (db->db_level+1 < dn->dn_nlevels) {
dmu_buf_impl_t *parent = db->db_parent;
dbuf_dirty_record_t *di;
@@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
+ dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* If this buffer is not dirty, we're done.
*/
- for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
if (dr->dr_txg <= txg)
break;
if (dr == NULL || dr->dr_txg < txg) {
@@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/* XXX would be nice to fix up dn_towrite_space[] */
- db->db_last_dirty = dr->dr_next;
+ *drp = dr->dr_next;
if (dr->dr_parent) {
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
@@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- int rf = DB_RF_MUST_SUCCEED;
+ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID)
+ if (db->db_blkid == DB_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
}
@@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
if (db->db_buf)
@@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t));
return (db);
} else {
int blocksize =
@@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
list_insert_head(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t));
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
@@ -1469,31 +1508,33 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
-
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (list_link_active(&db->db_link)) {
+ if (db->db_dnode) {
+ dnode_t *dn = db->db_dnode;
+
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
- db->db_dnode = NULL;
db->db_buf = NULL;
+ ASSERT(!list_link_active(&db->db_link));
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t));
}
void
@@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ arc_buf_t *pbuf;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
@@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
zb.zb_level = 0;
zb.zb_blkid = blkid;
- (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
- dmu_ot[dn->dn_type].ot_byteswap,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ if (db)
+ pbuf = db->db_buf;
+ else
+ pbuf = dn->dn_objset->os_phys_buf;
+
+ (void) arc_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
return (err ? NULL : db);
}
-dmu_buf_impl_t *
+void
dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
+ dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_evict(db);
} else {
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- mutex_exit(&db->db_mtx);
+ if (!DBUF_IS_CACHEABLE(db))
+ dbuf_clear(db);
+ else
+ mutex_exit(&db->db_mtx);
}
} else {
mutex_exit(&db->db_mtx);
@@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
db->db_data_pending = dr;
- arc_release(db->db_buf, db);
mutex_exit(&db->db_mtx);
-
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
- zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+ dbuf_write(dr, db->db_buf, tx);
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
@@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- int checksum, compress;
int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
if (db->db_blkid == DB_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
+
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
- if (*datap != db->db.db_data)
+ if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
while (*drp != dr)
drp = &(*drp)->dr_next;
- ASSERT((*drp)->dr_next == NULL);
- *drp = NULL;
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
/*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
* If this buffer is in the middle of an immdiate write,
* wait for the synchronous IO to complete.
*/
@@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- dbuf_check_blkptr(dn, db);
-
/*
* If this dbuf has already been written out via an immediate write,
* just complete the write by copying over the new block pointer and
@@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
zio_fake.io_bp = db->db_blkptr;
zio_fake.io_bp_orig = *db->db_blkptr;
zio_fake.io_txg = txg;
+ zio_fake.io_flags = 0;
*db->db_blkptr = dr->dt.dl.dr_overridden_by;
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dr->dr_zio = &zio_fake;
mutex_exit(&db->db_mtx);
+ ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
+ BP_IDENTITY(&zio_fake.io_bp_orig)) ||
+ BP_IS_HOLE(zio_fake.io_bp));
+
if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
&zio_fake.io_bp_orig, dn->dn_zio, tx);
dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
- } else {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
}
ASSERT(*datap != NULL);
@@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
-
- dbuf_write(dr, *datap, checksum, compress, tx);
+ dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT)
@@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
}
static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx)
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn = db->db_dnode;
@@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
+
+ if (!BP_IS_HOLE(db->db_blkptr) &&
+ (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(data, db);
+ } else {
+ ASSERT(arc_released(data));
+ /* XXX why do we need to thaw here? */
+ arc_buf_thaw(data);
+ }
if (parent != dn->dn_dbuf) {
ASSERT(parent && parent->db_data_pending);
@@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
+ wp.wp_type = dn->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dncompress = dn->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+ wp.wp_dnchecksum = dn->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+
if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
+ (void) dsl_dataset_block_kill(
os->os_dsl_dataset, db->db_blkptr, zio, tx);
- dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os, &zb, dn->dn_type), txg,
- db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+ dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+ DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
+ data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
/* ARGSUSED */
@@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_impl_t *db = vdb;
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t fill = 0;
int old_size, new_size, i;
+ ASSERT(db->db_blkptr == bp);
+
dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+ new_size = bp_get_dasize(os->os_spa, bp);
- dnode_diduse_space(dn, new_size-old_size);
+ dnode_diduse_space(dn, new_size - old_size);
- if (BP_IS_HOLE(zio->io_bp)) {
+ if (BP_IS_HOLE(bp)) {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ ASSERT3U(bp->blk_fill, ==, 0);
return;
}
+ ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+
mutex_enter(&db->db_mtx);
if (db->db_level == 0) {
@@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill = 1;
}
} else {
- blkptr_t *bp = db->db.db_data;
+ blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
- if (BP_IS_HOLE(bp))
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
continue;
- ASSERT3U(BP_GET_LSIZE(bp), ==,
+ ASSERT3U(BP_GET_LSIZE(ibp), ==,
db->db_level == 1 ? dn->dn_datablksz :
(1<<dn->dn_phys->dn_indblkshift));
- fill += bp->blk_fill;
+ fill += ibp->blk_fill;
}
}
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ } else {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- dsl_dataset_block_born(ds, zio->io_bp, tx);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
@@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_enter(&db->db_mtx);
drp = &db->db_last_dirty;
- while (*drp != db->db_data_pending)
- drp = &(*drp)->dr_next;
- ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
- ASSERT((*drp)->dr_txg == txg);
- ASSERT((*drp)->dr_next == NULL);
- dr = *drp;
- *drp = NULL;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (db->db_level == 0) {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index d3be6b4ff22e..377efb9d105e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
@@ -42,6 +40,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "DSL props" },
{ byteswap_uint64_array, TRUE, "DSL dataset" },
{ zfs_znode_byteswap, TRUE, "ZFS znode" },
- { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
{ byteswap_uint8_array, FALSE, "ZFS plain file" },
{ zap_byteswap, TRUE, "ZFS directory" },
{ zap_byteswap, TRUE, "ZFS master node" },
@@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "persistent error log" },
{ byteswap_uint8_array, TRUE, "SPA history" },
{ byteswap_uint64_array, TRUE, "SPA history offsets" },
- { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "DSL permissions" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
+ { byteswap_uint8_array, TRUE, "FUID table" },
+ { byteswap_uint64_array, TRUE, "FUID table size" },
+ { zap_byteswap, TRUE, "DSL dataset next clones"},
+ { zap_byteswap, TRUE, "scrub work queue" },
};
int
@@ -115,6 +121,19 @@ dmu_bonus_max(void)
return (DN_MAX_BONUSLEN);
}
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+ if (newsize < 0 || newsize > db->db_size)
+ return (EINVAL);
+ dnode_setbonuslen(dn, newsize, tx);
+ return (0);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -122,27 +141,27 @@ int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
- int err, count;
dmu_buf_impl_t *db;
+ int error;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
+ dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -161,11 +180,13 @@ static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
+ dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t flags;
int err;
zio_t *zio;
+ hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+ if (dn->dn_objset->os_dsl_dataset)
+ dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
+ if (dp && dsl_pool_sync_context(dp))
+ start = gethrtime();
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
@@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
/* wait for async i/o */
err = zio_wait(zio);
+ /* track read overhead when we are in sync context */
+ if (dp && dsl_pool_sync_context(dp))
+ dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
dnode_rele(dn, FTAG);
}
+static int
+get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+{
+ uint64_t len = *offset - limit;
+ uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
+ uint64_t subchunk =
+ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT(limit <= *offset);
+
+ if (len <= chunk_len) {
+ *offset = limit;
+ return (0);
+ }
+
+ ASSERT(ISP2(subchunk));
+
+ while (*offset > limit) {
+ uint64_t initial_offset = P2ROUNDUP(*offset, subchunk);
+ uint64_t delta;
+ int err;
+
+ /* skip over allocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ ASSERT3U(*offset, <=, initial_offset);
+ *offset = P2ALIGN(*offset, subchunk);
+ delta = initial_offset - *offset;
+ if (delta >= chunk_len) {
+ *offset += delta - chunk_len;
+ return (0);
+ }
+ chunk_len -= delta;
+
+ /* skip over unallocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ if (*offset < limit)
+ *offset = limit;
+ ASSERT3U(*offset, <, initial_offset);
+ }
+ return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length, boolean_t free_dnode)
+{
+ dmu_tx_t *tx;
+ uint64_t object_size, start, end, len;
+ boolean_t trunc = (length == DMU_OBJECT_END);
+ int align, err;
+
+ align = 1 << dn->dn_datablkshift;
+ ASSERT(align > 0);
+ object_size = align == 1 ? dn->dn_datablksz :
+ (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+
+ if (trunc || (end = offset + length) > object_size)
+ end = object_size;
+ if (end <= offset)
+ return (0);
+ length = end - offset;
+
+ while (length) {
+ start = end;
+ err = get_next_chunk(dn, &start, offset);
+ if (err)
+ return (err);
+ len = trunc ? DMU_OBJECT_END : end - start;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dnode_free_range(dn, start, trunc ? -1 : len, tx);
+
+ if (start == 0 && free_dnode) {
+ ASSERT(trunc);
+ dnode_free(dn, tx);
+ }
+
+ length -= end - start;
+
+ dmu_tx_commit(tx);
+ end = start;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_t *tx;
+ int err;
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err != 0)
+ return (err);
+ if (dn->dn_nlevels == 1) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ } else {
+ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
@@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp);
if (err)
- return (err);
+ break;
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
void
@@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ va = zfs_map_page(pp, S_READ);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- ppmapout(va);
+ zfs_unmap_page(pp, va);
pp = pp->p_next;
bufoff += PAGESIZE;
}
@@ -620,6 +796,22 @@ typedef struct {
/* ARGSUSED */
static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (!BP_IS_HOLE(bp)) {
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
+}
+
+/* ARGSUSED */
+static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *in = varg;
@@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dmu_buf_impl_t *db = dr->dr_dbuf;
dmu_sync_cb_t *done = in->done;
- if (!BP_IS_HOLE(zio->io_bp)) {
- zio->io_bp->blk_fill = 1;
- BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
- BP_SET_LEVEL(zio->io_bp, 0);
- }
-
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
@@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *in;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
int err;
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
-
dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
@@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- zio = arc_write(pio, os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
- dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
- txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ wp.wp_type = db->db_dnode->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dnchecksum = db->db_dnode->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+ wp.wp_dncompress = db->db_dnode->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+
+ ASSERT(BP_IS_HOLE(bp));
+
+ zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
+ txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
if (pio) {
zio_nowait(zio);
@@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
}
int
-dmu_get_replication_level(objset_impl_t *os,
- zbookmark_t *zb, dmu_object_type_t ot)
-{
- int ncopies = os->os_copies;
-
- /* If it's the mos, it should have max copies set. */
- ASSERT(zb->zb_objset != 0 ||
- ncopies == spa_max_replication(os->os_spa));
-
- if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
- ncopies++;
- return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
-
-int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
@@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
return (err);
}
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
dnode_rele(dn, FTAG);
return (err);
@@ -1018,6 +1193,7 @@ dmu_init(void)
dbuf_init();
dnode_init();
arc_init();
+ l2arc_init();
}
void
@@ -1026,4 +1202,5 @@ dmu_fini(void)
arc_fini();
dnode_fini();
dbuf_fini();
+ l2arc_fini();
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 93168cc8901f..1b9247d66e65 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,7 +54,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
int error = dnode_next_offset(osi->os_meta_dnode,
- B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ DNODE_FIND_HOLE,
+ &offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
if (error == 0)
object = offset >> DNODE_SHIFT;
@@ -139,6 +140,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
@@ -152,7 +154,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
int error;
error = dnode_next_offset(os->os->os_meta_dnode,
- hole, &offset, 0, DNODES_PER_BLOCK, txg);
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 378fe8c15bc0..7981e06825c4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -19,12 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
@@ -32,6 +31,7 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
@@ -40,7 +40,7 @@
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
-
+#include <sys/zfs_ioctl.h>
spa_t *
dmu_objset_spa(objset_t *os)
@@ -131,6 +131,34 @@ copies_changed_cb(void *arg, uint64_t newval)
osi->os_copies = newval;
}
+static void
+primary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ osi->os_primary_cache = newval;
+}
+
+static void
+secondary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ osi->os_secondary_cache = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -146,8 +174,10 @@ int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
objset_impl_t **osip)
{
- objset_impl_t *winner, *osi;
- int i, err, checksum;
+ objset_impl_t *osi;
+ int i, err;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
osi->os.os = osi;
@@ -161,18 +191,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zb.zb_object = 0;
zb.zb_level = -1;
zb.zb_blkid = 0;
+ if (DMU_OS_IS_L2CACHEABLE(osi))
+ aflags |= ARC_L2CACHE;
dprintf_bp(osi->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, osi->os_rootbp,
- dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ /*
+ * NB: when bprewrite scrub can change the bp,
+ * and this is called from dmu_objset_open_ds_os, the bp
+ * could change, and we'll need a lock.
+ */
+ err = arc_read_nolock(NULL, spa, osi->os_rootbp,
arc_getbuf_func, &osi->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
kmem_free(osi, sizeof (objset_impl_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
return (err);
}
osi->os_phys = osi->os_phys_buf->b_data;
- arc_release(osi->os_phys_buf, &osi->os_phys_buf);
} else {
osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
&osi->os_phys_buf, ARC_BUFC_METADATA);
@@ -183,18 +221,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/*
* Note: the changed_cb will be called once before the register
* func returns, thus changing the checksum/compression from the
- * default (fletcher2/off). Snapshots don't need to know, and
- * registering would complicate clone promotion.
+ * default (fletcher2/off). Snapshots don't need to know about
+ * checksum/compression/copies.
*/
- if (ds && ds->ds_phys->ds_num_children == 0) {
- err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
+ if (ds) {
+ err = dsl_prop_register(ds, "primarycache",
+ primary_cache_changed_cb, osi);
if (err == 0)
- err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
+ err = dsl_prop_register(ds, "secondarycache",
+ secondary_cache_changed_cb, osi);
+ if (!dsl_dataset_is_snapshot(ds)) {
+ if (err == 0)
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "copies",
+ copies_changed_cb, osi);
+ }
if (err) {
VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
&osi->os_phys_buf) == 1);
@@ -206,24 +252,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
osi->os_compress = ZIO_COMPRESS_LZJB;
osi->os_copies = spa_max_replication(spa);
+ osi->os_primary_cache = ZFS_CACHE_ALL;
+ osi->os_secondary_cache = ZFS_CACHE_ALL;
}
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
-
- /*
- * Metadata always gets compressed and checksummed.
- * If the data checksum is multi-bit correctable, and it's not
- * a ZBT-style checksum, then it's suitable for metadata as well.
- * Otherwise, the metadata checksum defaults to fletcher4.
- */
- checksum = osi->os_checksum;
-
- if (zio_checksum_table[checksum].ci_correctable &&
- !zio_checksum_table[checksum].ci_zbt)
- osi->os_md_checksum = checksum;
- else
- osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_md_compress = ZIO_COMPRESS_LZJB;
+ osi->os_zil_header = osi->os_phys->os_zil_header;
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
@@ -238,70 +272,118 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
osi->os_meta_dnode = dnode_special_open(osi,
&osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
- if (ds != NULL) {
- winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
- if (winner) {
- dmu_objset_evict(ds, osi);
- osi = winner;
- }
+ /*
+ * We should be the only thread trying to do this because we
+ * have ds_opening_lock
+ */
+ if (ds) {
+ VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
+ dmu_objset_evict));
}
*osip = osi;
return (0);
}
-/* called from zpl */
-int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
+static int
+dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
{
- dsl_dataset_t *ds;
- int err;
- objset_t *os;
objset_impl_t *osi;
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dsl_dataset_open(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
-
+ mutex_enter(&ds->ds_opening_lock);
osi = dsl_dataset_get_user_ptr(ds);
if (osi == NULL) {
+ int err;
+
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
ds, &ds->ds_phys->ds_bp, &osi);
if (err) {
- dsl_dataset_close(ds, mode, os);
- kmem_free(os, sizeof (objset_t));
+ mutex_exit(&ds->ds_opening_lock);
return (err);
}
}
+ mutex_exit(&ds->ds_opening_lock);
os->os = osi;
- os->os_mode = mode;
+ os->os_mode = DS_MODE_NOHOLD;
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
- dmu_objset_close(os);
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
return (EINVAL);
- }
- *osp = os;
return (0);
}
+int
+dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+{
+ objset_t *os;
+ int err;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dmu_objset_open_ds_os(ds, os, type);
+ if (err)
+ kmem_free(os, sizeof (objset_t));
+ else
+ *osp = os;
+ return (err);
+}
+
+/* called from zpl */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ objset_t *os;
+ dsl_dataset_t *ds;
+ int err;
+
+ ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER ||
+ DS_MODE_TYPE(mode) == DS_MODE_OWNER);
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ if (DS_MODE_TYPE(mode) == DS_MODE_USER)
+ err = dsl_dataset_hold(name, os, &ds);
+ else
+ err = dsl_dataset_own(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ err = dmu_objset_open_ds_os(ds, os, type);
+ if (err) {
+ if (DS_MODE_TYPE(mode) == DS_MODE_USER)
+ dsl_dataset_rele(ds, os);
+ else
+ dsl_dataset_disown(ds, os);
+ kmem_free(os, sizeof (objset_t));
+ } else {
+ os->os_mode = mode;
+ *osp = os;
+ }
+ return (err);
+}
+
void
dmu_objset_close(objset_t *os)
{
- dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER ||
+ DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER ||
+ DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD);
+
+ if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER)
+ dsl_dataset_rele(os->os->os_dsl_dataset, os);
+ else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER)
+ dsl_dataset_disown(os->os->os_dsl_dataset, os);
kmem_free(os, sizeof (objset_t));
}
int
-dmu_objset_evict_dbufs(objset_t *os, int try)
+dmu_objset_evict_dbufs(objset_t *os)
{
objset_impl_t *osi = os->os;
dnode_t *dn;
@@ -319,34 +401,25 @@ dmu_objset_evict_dbufs(objset_t *os, int try)
* skip.
*/
for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
+ dn && !dnode_add_ref(dn, FTAG);
dn = list_next(&osi->os_dnodes, dn))
continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
while (dn) {
dnode_t *next_dn = dn;
do {
next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
+ } while (next_dn && !dnode_add_ref(next_dn, FTAG));
mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
+ dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&osi->os_lock);
dn = next_dn;
}
mutex_exit(&osi->os_lock);
- return (0);
+ return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
}
void
@@ -361,13 +434,19 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
}
- if (ds && ds->ds_phys->ds_num_children == 0) {
- VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
+ if (ds) {
+ if (!dsl_dataset_is_snapshot(ds)) {
+ VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "copies",
+ copies_changed_cb, osi));
+ }
+ VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
+ primary_cache_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
+ secondary_cache_changed_cb, osi));
}
/*
@@ -375,7 +454,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
* nothing can be added to the list at this point.
*/
os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
+ (void) dmu_objset_evict_dbufs(&os);
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
@@ -387,6 +466,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
mutex_destroy(&osi->os_lock);
mutex_destroy(&osi->os_obj_lock);
+ mutex_destroy(&osi->os_user_ptr_lock);
kmem_free(osi, sizeof (objset_impl_t));
}
@@ -399,7 +479,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
+ if (ds)
+ mutex_enter(&ds->ds_opening_lock);
VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+ if (ds)
+ mutex_exit(&ds->ds_opening_lock);
mdn = osi->os_meta_dnode;
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -443,14 +527,15 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
}
struct oscarg {
- void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
void *userarg;
dsl_dataset_t *clone_parent;
const char *lastname;
dmu_objset_type_t type;
+ uint64_t flags;
};
-/* ARGSUSED */
+/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -478,11 +563,12 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (oa->clone_parent->ds_phys->ds_num_children == 0)
return (EINVAL);
}
+
return (0);
}
static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct oscarg *oa = arg2;
@@ -493,10 +579,9 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_parent, tx);
+ oa->clone_parent, oa->flags, cr, tx);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
+ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
bp = dsl_dataset_get_blkptr(ds);
if (BP_IS_HOLE(bp)) {
objset_impl_t *osi;
@@ -506,15 +591,19 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ds, bp, oa->type, tx);
if (oa->userfunc)
- oa->userfunc(&osi->os, oa->userarg, tx);
+ oa->userfunc(&osi->os, oa->userarg, cr, tx);
}
- dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+
+ spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", dsobj);
+
+ dsl_dataset_rele(ds, FTAG);
}
int
dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+ objset_t *clone_parent, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
dsl_dir_t *pdd;
const char *tail;
@@ -536,6 +625,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
oa.userarg = arg;
oa.lastname = tail;
oa.type = type;
+ oa.flags = flags;
+
if (clone_parent != NULL) {
/*
* You can't clone to a different type.
@@ -564,33 +655,47 @@ dmu_objset_destroy(const char *name)
* It would be nicer to do this in dsl_dataset_destroy_sync(),
* but the replay log objset is modified in open context.
*/
- error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ error = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
if (error == 0) {
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
zil_destroy(dmu_objset_zil(os), B_FALSE);
- dmu_objset_close(os);
+
+ error = dsl_dataset_destroy(ds, os);
+ /*
+ * dsl_dataset_destroy() closes the ds.
+ */
+ kmem_free(os, sizeof (objset_t));
}
- return (dsl_dataset_destroy(name));
+ return (error);
}
+/*
+ * This will close the objset.
+ */
int
-dmu_objset_rollback(const char *name)
+dmu_objset_rollback(objset_t *os)
{
int err;
- objset_t *os;
+ dsl_dataset_t *ds;
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err == 0) {
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0)
- zil_resume(dmu_objset_zil(os));
- if (err == 0) {
- /* XXX uncache everything? */
- err = dsl_dataset_rollback(os->os->os_dsl_dataset);
- }
+ ds = os->os->os_dsl_dataset;
+
+ if (!dsl_dataset_tryown(ds, TRUE, os)) {
dmu_objset_close(os);
+ return (EBUSY);
}
+
+ err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+
+ /*
+ * NB: we close the objset manually because the rollback
+ * actually implicitly called dmu_objset_evict(), thus freeing
+ * the objset_impl_t.
+ */
+ dsl_dataset_disown(ds, os);
+ kmem_free(os, sizeof (objset_t));
return (err);
}
@@ -598,6 +703,13 @@ struct snaparg {
dsl_sync_task_group_t *dstg;
char *snapname;
char failed[MAXPATHLEN];
+ boolean_t checkperms;
+ list_t objsets;
+};
+
+struct osnode {
+ list_node_t node;
+ objset_t *os;
};
static int
@@ -605,20 +717,25 @@ dmu_objset_snapshot_one(char *name, void *arg)
{
struct snaparg *sn = arg;
objset_t *os;
- dmu_objset_stats_t stat;
int err;
(void) strcpy(sn->failed, name);
- err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ /*
+ * Check permissions only when requested. This only applies when
+ * doing a recursive snapshot. The permission checks for the starting
+ * dataset have already been performed in zfs_secpolicy_snapshot()
+ */
+ if (sn->checkperms == B_TRUE &&
+ (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+ return (err);
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
if (err != 0)
return (err);
- /*
- * If the objset is in an inconsistent state, return busy.
- */
- dmu_objset_fast_stat(os, &stat);
- if (stat.dds_inconsistent) {
+ /* If the objset is in an inconsistent state, return busy */
+ if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
dmu_objset_close(os);
return (EBUSY);
}
@@ -630,8 +747,13 @@ dmu_objset_snapshot_one(char *name, void *arg)
*/
err = zil_suspend(dmu_objset_zil(os));
if (err == 0) {
+ struct osnode *osn;
dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, os, sn->snapname, 3);
+ dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
+ sn->snapname, 3);
+ osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
+ osn->os = os;
+ list_insert_tail(&sn->objsets, osn);
} else {
dmu_objset_close(os);
}
@@ -643,31 +765,28 @@ int
dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
{
dsl_sync_task_t *dst;
+ struct osnode *osn;
struct snaparg sn = { 0 };
- char *cp;
spa_t *spa;
int err;
(void) strcpy(sn.failed, fsname);
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
+ err = spa_open(fsname, &spa, FTAG);
if (err)
return (err);
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
+ list_create(&sn.objsets, sizeof (struct osnode),
+ offsetof(struct osnode, node));
if (recursive) {
+ sn.checkperms = B_TRUE;
err = dmu_objset_find(fsname,
dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
} else {
+ sn.checkperms = B_FALSE;
err = dmu_objset_snapshot_one(fsname, &sn);
}
@@ -678,13 +797,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
for (dst = list_head(&sn.dstg->dstg_tasks); dst;
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
- objset_t *os = dst->dst_arg1;
+ dsl_dataset_t *ds = dst->dst_arg1;
if (dst->dst_err)
- dmu_objset_name(os, sn.failed);
- zil_resume(dmu_objset_zil(os));
- dmu_objset_close(os);
+ dsl_dataset_name(ds, sn.failed);
}
+
out:
+ while (osn = list_head(&sn.objsets)) {
+ list_remove(&sn.objsets, osn);
+ zil_resume(dmu_objset_zil(osn->os));
+ dmu_objset_close(osn->os);
+ kmem_free(osn, sizeof (struct osnode));
+ }
+ list_destroy(&sn.objsets);
+
if (err)
(void) strcpy(fsname, sn.failed);
dsl_sync_task_group_destroy(sn.dstg);
@@ -717,39 +843,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
static void
ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
objset_impl_t *os = arg;
- blkptr_t *bp = os->os_rootbp;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
- int i;
+
+ ASSERT(bp == os->os_rootbp);
+ ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
/*
* Update rootbp fill count.
*/
bp->blk_fill = 1; /* count the meta-dnode */
- for (i = 0; i < dnp->dn_nblkptr; i++)
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
-}
-/* ARGSUSED */
-static void
-killer(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- objset_impl_t *os = arg;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
- BP_SET_LEVEL(zio->io_bp, 0);
-
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
- BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ } else {
if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, NULL, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
- os->os_synctx);
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, zio, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
}
- arc_release(os->os_phys_buf, &os->os_phys_buf);
}
/* called from dsl */
@@ -758,10 +875,10 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
list_t *list;
dbuf_dirty_record_t *dr;
- int zio_flags;
dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -783,19 +900,24 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
*/
zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
zb.zb_object = 0;
- zb.zb_level = -1;
+ zb.zb_level = -1; /* for block ordering; it's level 0 on disk */
zb.zb_blkid = 0;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+
+ wp.wp_type = DMU_OT_OBJSET;
+ wp.wp_level = 0; /* on-disk BP level; see above */
+ wp.wp_copies = os->os_copies;
+ wp.wp_oschecksum = os->os_checksum;
+ wp.wp_oscompress = os->os_compress;
+
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
os->os_rootbp, pio, tx);
- zio = arc_write(pio, os->os_spa, os->os_md_checksum,
- os->os_md_compress,
- dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+ }
+
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+ zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync meta-dnode - the parent IO for the sync is the root block
@@ -819,6 +941,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
* Free intent log blocks up to this tx.
*/
zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
zio_nowait(zio);
}
@@ -867,8 +990,23 @@ dmu_objset_is_snapshot(objset_t *os)
}
int
+dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
+ boolean_t *conflict)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ uint64_t ignored;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
+ real, maxlen, conflict));
+}
+
+int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp)
+ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
dsl_dataset_t *ds = os->os->os_dsl_dataset;
zap_cursor_t cursor;
@@ -894,6 +1032,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
(void) strcpy(name, attr.za_name);
if (idp)
*idp = attr.za_first_integer;
+ if (case_conflict)
+ *case_conflict = attr.za_normalization_conflict;
zap_cursor_advance(&cursor);
*offp = zap_cursor_serialize(&cursor);
zap_cursor_fini(&cursor);
@@ -938,48 +1078,80 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
return (0);
}
+struct findarg {
+ int (*func)(char *, void *);
+ void *arg;
+};
+
+/* ARGSUSED */
+static int
+findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct findarg *fa = arg;
+ return (fa->func((char *)dsname, fa->arg));
+}
+
/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * Perhaps change all callers to use dmu_objset_find_spa()?
*/
int
dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
{
+ struct findarg fa;
+ fa.func = func;
+ fa.arg = arg;
+ return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
+}
+
+/*
+ * Find all objsets under name, call func on each
+ */
+int
+dmu_objset_find_spa(spa_t *spa, const char *name,
+ int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+{
dsl_dir_t *dd;
- objset_t *os;
- uint64_t snapobj;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
char *child;
- int do_self, err;
+ uint64_t thisobj;
+ int err;
- err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (name == NULL)
+ name = spa_name(spa);
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
if (err)
return (err);
- /* NB: the $MOS dir doesn't have a head dataset */
- do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_close(dd, FTAG);
+ return (0);
+ }
+
+ thisobj = dd->dd_phys->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ dp = dd->dd_pool;
/*
* Iterate over all children.
*/
if (flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
dd->dd_phys->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT(attr->za_integer_length == sizeof (uint64_t));
ASSERT(attr->za_num_integers == 1);
- /*
- * No separating '/' because parent's name ends in /.
- */
child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
+ (void) strcpy(child, name);
(void) strcat(child, "/");
(void) strcat(child, attr->za_name);
- err = dmu_objset_find(child, func, arg, flags);
+ err = dmu_objset_find_spa(spa, child, func, arg, flags);
kmem_free(child, MAXPATHLEN);
if (err)
break;
@@ -996,30 +1168,36 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
/*
* Iterate over all snapshots.
*/
- if ((flags & DS_FIND_SNAPSHOTS) &&
- dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
-
- snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
- dmu_objset_close(os);
-
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
+ if (flags & DS_FIND_SNAPSHOTS) {
+ if (!dsl_pool_sync_context(dp))
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+ if (!dsl_pool_sync_context(dp))
+ rw_exit(&dp->dp_config_rwlock);
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
- err = func(child, arg);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
+ if (err == 0) {
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr->za_integer_length ==
+ sizeof (uint64_t));
+ ASSERT(attr->za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ (void) strcpy(child, name);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr->za_name);
+ err = func(spa, attr->za_first_integer,
+ child, arg);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
}
- zap_cursor_fini(&zc);
}
dsl_dir_close(dd, FTAG);
@@ -1031,7 +1209,20 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
/*
* Apply to self if appropriate.
*/
- if (do_self)
- err = func(name, arg);
+ err = func(spa, thisobj, name, arg);
return (err);
}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ os->os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ return (os->os->os_user_ptr);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 3e55dc301620..1294581a7133 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,10 +41,13 @@
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+static char *dmu_recv_tag = "dmu_recv_tag";
+
struct backuparg {
dmu_replay_record_t *drr;
kthread_t *td;
struct file *fp;
+ offset_t *off;
objset_t *os;
zio_cksum_t zc;
int err;
@@ -77,6 +80,7 @@ dump_bytes(struct backuparg *ba, void *buf, int len)
fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
ba->err = EOPNOTSUPP;
#endif
+ *ba->off += len;
return (ba->err);
}
@@ -179,7 +183,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
void *data = bc->bc_data;
int err = 0;
- if (SIGPENDING(curthread))
+ if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
ASSERT(data || bp == NULL);
@@ -215,10 +219,9 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
zb.zb_object = object;
zb.zb_level = level;
zb.zb_blkid = blkid;
- (void) arc_read(NULL, spa, bp,
- dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
+ (void) arc_read_nolock(NULL, spa, bp,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
if (abuf) {
err = dump_data(ba, type, object, blkid * blksz,
@@ -236,13 +239,15 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
}
int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+ struct file *fp, offset_t *off)
{
dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
dmu_replay_record_t *drr;
struct backuparg ba;
int err;
+ uint64_t fromtxg = 0;
/* tosnap must be a snapshot */
if (ds->ds_phys->ds_next_snap_obj == 0)
@@ -250,26 +255,55 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
/* fromsnap must be an earlier snapshot from the same fs as tosnap */
if (fromds && (ds->ds_dir != fromds->ds_dir ||
- fromds->ds_phys->ds_creation_txg >=
- ds->ds_phys->ds_creation_txg))
+ fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
return (EXDEV);
+ if (fromorigin) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (fromsnap)
+ return (EINVAL);
+
+ if (dsl_dir_is_clone(ds->ds_dir)) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ return (err);
+ } else {
+ fromorigin = B_FALSE;
+ }
+ }
+
+
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+ drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ if (fromorigin)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+
if (fromds)
drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+ if (fromds)
+ fromtxg = fromds->ds_phys->ds_creation_txg;
+ if (fromorigin)
+ dsl_dataset_rele(fromds, FTAG);
+
ba.drr = drr;
ba.td = curthread;
ba.fp = fp;
ba.os = tosnap;
+ ba.off = off;
ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
@@ -277,8 +311,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
return (ba.err);
}
- err = traverse_dsl_dataset(ds,
- fromds ? fromds->ds_phys->ds_creation_txg : 0,
+ err = traverse_dsl_dataset(ds, fromtxg,
ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
backup_cb, &ba);
@@ -303,164 +336,384 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
return (0);
}
-struct restorearg {
- int err;
- int byteswap;
- kthread_t *td;
- struct file *fp;
- char *buf;
- uint64_t voff;
- int buflen; /* number of valid bytes in buf */
- int bufoff; /* next offset to read */
- int bufsize; /* amount of memory allocated for buf */
- zio_cksum_t zc;
+struct recvbeginsyncarg {
+ const char *tofs;
+ const char *tosnap;
+ dsl_dataset_t *origin;
+ uint64_t fromguid;
+ dmu_objset_type_t type;
+ void *tag;
+ boolean_t force;
+ uint64_t dsflags;
+ char clonelastname[MAXNAMELEN];
+ dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};
+static dsl_dataset_t *
+recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
+ cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+
+ /* This should always work, since we just created it */
+ /* XXX - create should return an owned ds */
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
+ DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
+
+ if (type != DMU_OST_NONE) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ ds, &ds->ds_phys->ds_bp, type, tx);
+ }
+
+ spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
+ dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+
+ return (ds);
+}
+
/* ARGSUSED */
static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
- struct drr_begin *drrb = arg2;
- const char *snapname;
- int err;
+ dsl_dir_t *dd = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t val;
+ int err;
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
- return (ENODEV);
- /* must not have any changes since most recent snapshot */
- if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (ETXTBSY);
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
- /* new snapshot name must not exist */
- snapname = strrchr(drrb->drr_toname, '@');
- if (snapname == NULL)
- return (EEXIST);
+ if (rbsa->origin) {
+ /* make sure it's a snap in the same pool */
+ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
+ return (EXDEV);
+ if (rbsa->origin->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ }
- snapname++;
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
+ return (0);
+}
+
+static void
+recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+ uint64_t dsobj;
+
+ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
+ rbsa->origin, flags, cr, tx);
+
+ rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
+ rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+}
+
+static int
+recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ int err;
+
+ /* must be a head ds */
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ return (EINVAL);
+
+ /* must not be a clone ds */
+ if (dsl_dir_is_clone(ds->ds_dir))
+ return (EINVAL);
+
+ err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
+ if (err)
return (err);
+ if (rbsa->origin) {
+ /* make sure it's a snap in the same pool */
+ if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
+ return (EXDEV);
+ if (rbsa->origin->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ }
+
return (0);
}
-/* ARGSUSED */
static void
-replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ struct recvbeginsyncarg *rbsa = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
+ uint64_t dsobj;
+
+ /*
+ * NB: caller must provide an extra hold on the dsl_dir_t, so it
+ * won't go away when dsl_dataset_destroy_sync() closes the
+ * dataset.
+ */
+ dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
+
+ dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
+
+ rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
+ rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
}
/* ARGSUSED */
static int
-replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- char *cp;
- uint64_t val;
+ dsl_dataset_t *ds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
int err;
+ uint64_t val;
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
- strrchr(drrb->drr_toname, '/') + 1,
- sizeof (uint64_t), 1, &val);
- *cp = '@';
+ /* must not have any changes since most recent snapshot */
+ if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
+ return (ETXTBSY);
+
+ /* must already be a snapshot of this fs */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ return (ENODEV);
+
+ /* most recent snapshot must match fromguid */
+ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
+ return (ENODEV);
+ /* temporary clone name must not exist */
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_dir->dd_phys->dd_child_dir_zapobj,
+ rbsa->clonelastname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
if (err != ENOENT)
- return (err ? err : EEXIST);
+ return (err);
+ /* new snapshot name must not exist */
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
return (0);
}
+/* ARGSUSED */
static void
-replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- char *cp;
- dsl_dataset_t *ds;
+ dsl_dataset_t *ohds = arg1;
+ struct recvbeginsyncarg *rbsa = arg2;
+ dsl_pool_t *dp = ohds->ds_dir->dd_pool;
+ dsl_dataset_t *ods, *cds;
+ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
- NULL, tx);
- *cp = '@';
+ /* create the temporary clone */
+ VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
+ FTAG, &ods));
+ dsobj = dsl_dataset_create_sync(ohds->ds_dir,
+ rbsa->clonelastname, ods, flags, cr, tx);
+ dsl_dataset_rele(ods, FTAG);
+
+ /* open the temporary clone */
+ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
+ DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_EXCLUSIVE, FTAG, &ds));
+ /* copy the refquota from the target fs to the clone */
+ if (ohds->ds_quota > 0)
+ dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
- (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
+ rbsa->ds = cds;
+
+ spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+ dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+}
+
+/* ARGSUSED */
+static void
+recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
+ ds->ds_object);
}
-static int
-replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+ boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
+ int err = 0;
+ boolean_t byteswap;
+ struct recvbeginsyncarg rbsa;
+ uint64_t version;
+ int flags;
+ dsl_dataset_t *ds;
- /* XXX verify that drr_toname is in dd */
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC)
+ byteswap = FALSE;
+ else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+ byteswap = TRUE;
+ else
+ return (EINVAL);
- snapname = strchr(drrb->drr_toname, '@');
- if (snapname == NULL)
+ rbsa.tofs = tofs;
+ rbsa.tosnap = tosnap;
+ rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
+ rbsa.fromguid = drrb->drr_fromguid;
+ rbsa.type = drrb->drr_type;
+ rbsa.tag = FTAG;
+ rbsa.dsflags = 0;
+ version = drrb->drr_version;
+ flags = drrb->drr_flags;
+
+ if (byteswap) {
+ rbsa.type = BSWAP_32(rbsa.type);
+ rbsa.fromguid = BSWAP_64(rbsa.fromguid);
+ version = BSWAP_64(version);
+ flags = BSWAP_32(flags);
+ }
+
+ if (version != DMU_BACKUP_STREAM_VERSION ||
+ rbsa.type >= DMU_OST_NUMTYPES ||
+ ((flags & DRR_FLAG_CLONE) && origin == NULL))
return (EINVAL);
- snapname++;
- return (dsl_dataset_snapshot_check(os, snapname, tx));
-}
+ if (flags & DRR_FLAG_CI_DATA)
+ rbsa.dsflags = DS_FLAG_CI_DATASET;
-static void
-replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
- dsl_dataset_t *ds, *hds;
+ bzero(drc, sizeof (dmu_recv_cookie_t));
+ drc->drc_drrb = drrb;
+ drc->drc_tosnap = tosnap;
+ drc->drc_force = force;
- snapname = strchr(drrb->drr_toname, '@') + 1;
+ /*
+ * Process the begin in syncing context.
+ */
+ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
+ /* offline incremental receive */
+ err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
+ if (err)
+ return (err);
- dsl_dataset_snapshot_sync(os, snapname, tx);
+ /*
+ * Only do the rollback if the most recent snapshot
+ * matches the incremental source
+ */
+ if (force) {
+ if (ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_guid !=
+ rbsa.fromguid) {
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (ENODEV);
+ }
+ (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ }
+ rbsa.force = B_FALSE;
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_incremental_check,
+ recv_offline_incremental_sync, ds, &rbsa, 1);
+ if (err) {
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (err);
+ }
+ drc->drc_logical_ds = drc->drc_real_ds = ds;
+ } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
+ /* online incremental receive */
- /* set snapshot's creation time and guid */
- hds = os->os->os_dsl_dataset;
- VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
- hds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds));
+ /* tmp clone name is: tofs/%tosnap" */
+ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
+ "%%%s", tosnap);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
- ds->ds_phys->ds_guid = drrb->drr_toguid;
- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ /* open the dataset we are logically receiving into */
+ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+ if (err)
+ return (err);
- dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+ rbsa.force = force;
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_incremental_check,
+ recv_online_incremental_sync, ds, &rbsa, 5);
+ if (err) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (err);
+ }
+ drc->drc_logical_ds = ds;
+ drc->drc_real_ds = rbsa.ds;
+ } else {
+ /* create new fs -- full backup or clone */
+ dsl_dir_t *dd = NULL;
+ const char *tail;
+
+ err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ if (!force) {
+ dsl_dir_close(dd, FTAG);
+ return (EEXIST);
+ }
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_own_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ DS_MODE_INCONSISTENT, FTAG, &ds);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+
+ dsl_dataset_make_exclusive(ds, FTAG);
+ err = dsl_sync_task_do(dd->dd_pool,
+ recv_full_existing_check,
+ recv_full_existing_sync, ds, &rbsa, 5);
+ dsl_dataset_disown(ds, FTAG);
+ } else {
+ err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
+ recv_full_sync, dd, &rbsa, 5);
+ }
+ dsl_dir_close(dd, FTAG);
+ if (err)
+ return (err);
+ drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
+ drc->drc_newfs = B_TRUE;
+ }
- dmu_buf_will_dirty(hds->ds_dbuf, tx);
- hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ return (0);
}
+struct restorearg {
+ int err;
+ int byteswap;
+ kthread_t *td;
+ struct file *fp;
+ char *buf;
+ uint64_t voff;
+ int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t cksum;
+};
+
static int
restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
{
@@ -491,37 +744,31 @@ static void *
restore_read(struct restorearg *ra, int len)
{
void *rv;
+ int done = 0;
/* some things will require 8-byte alignment, so everything must */
ASSERT3U(len % 8, ==, 0);
- while (ra->buflen - ra->bufoff < len) {
+ while (done < len) {
int resid;
- int leftover = ra->buflen - ra->bufoff;
- (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+ ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
+ len - done, ra->voff, &resid);
- ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
- ra->bufsize - leftover, ra->voff, &resid);
-
- ra->voff += ra->bufsize - leftover - resid;
- ra->buflen = ra->bufsize - resid;
- ra->bufoff = 0;
- if (resid == ra->bufsize - leftover)
+ if (resid == len - done)
ra->err = EINVAL;
+ ra->voff += len - done - resid;
+ done = len - resid;
if (ra->err)
return (NULL);
- /* Could compute checksum here? */
}
- ASSERT3U(ra->bufoff % 8, ==, 0);
- ASSERT3U(ra->buflen - ra->bufoff, >=, len);
- rv = ra->buf + ra->bufoff;
- ra->bufoff += len;
+ ASSERT3U(done, ==, len);
+ rv = ra->buf;
if (ra->byteswap)
- fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
else
- fletcher_4_incremental_native(rv, len, &ra->zc);
+ fletcher_4_incremental_native(rv, len, &ra->cksum);
return (rv);
}
@@ -531,12 +778,14 @@ backup_byteswap(dmu_replay_record_t *drr)
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
switch (drr->drr_type) {
case DRR_BEGIN:
DO64(drr_begin.drr_magic);
DO64(drr_begin.drr_version);
DO64(drr_begin.drr_creation_time);
DO32(drr_begin.drr_type);
+ DO32(drr_begin.drr_flags);
DO64(drr_begin.drr_toguid);
DO64(drr_begin.drr_fromguid);
break;
@@ -643,13 +892,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
if (data == NULL) {
dmu_tx_commit(tx);
return (ra->err);
}
- bcopy(data, db->db_data, db->db_size);
+ bcopy(data, db->db_data, drro->drr_bonuslen);
if (ra->byteswap) {
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
@@ -673,23 +922,14 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
int err;
if (dmu_object_info(os, obj, NULL) != 0)
continue;
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
+ err = dmu_free_object(os, obj);
+ if (err)
return (err);
- }
- err = dmu_object_free(os, obj, tx);
- dmu_tx_commit(tx);
- if (err && err != ENOENT)
- return (EINVAL);
}
return (0);
}
@@ -735,7 +975,6 @@ static int
restore_free(struct restorearg *ra, objset_t *os,
struct drr_free *drrf)
{
- dmu_tx_t *tx;
int err;
if (drrf->drr_length != -1ULL &&
@@ -745,66 +984,65 @@ restore_free(struct restorearg *ra, objset_t *os,
if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
return (EINVAL);
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
+ err = dmu_free_long_range(os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
return (err);
}
+void
+dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
+{
+ if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
+ /*
+ * online incremental or new fs: destroy the fs (which
+ * may be a clone) that we created
+ */
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (drc->drc_real_ds != drc->drc_logical_ds)
+ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+ } else {
+ /*
+ * offline incremental: rollback to most recent snapshot.
+ */
+ (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
+ dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
+ }
+}
+
+/*
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
int
-dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset)
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp)
{
kthread_t *td = curthread;
- struct restorearg ra;
+ struct restorearg ra = { 0 };
dmu_replay_record_t *drr;
- char *cp;
- objset_t *os = NULL;
- zio_cksum_t pzc;
-
- bzero(&ra, sizeof (ra));
- ra.td = td;
- ra.fp = fp;
- ra.voff = voffset;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+ objset_t *os;
+ zio_cksum_t pcksum;
- if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
- ra.byteswap = FALSE;
- } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
ra.byteswap = TRUE;
- } else {
- ra.err = EINVAL;
- goto out;
- }
- /*
- * NB: this assumes that struct drr_begin will be the largest in
- * dmu_replay_record_t's drr_u, and thus we don't need to pad it
- * with zeros to make it the same length as we wrote out.
- */
- ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
- ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
- ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
- if (ra.byteswap) {
- fletcher_4_incremental_byteswap(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- } else {
- fletcher_4_incremental_native(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
+ {
+ /* compute checksum of drr_begin record */
+ dmu_replay_record_t *drr;
+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+
+ drr->drr_type = DRR_BEGIN;
+ drr->drr_u.drr_begin = *drc->drc_drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(drr,
+ sizeof (dmu_replay_record_t), &ra.cksum);
+ } else {
+ fletcher_4_incremental_native(drr,
+ sizeof (dmu_replay_record_t), &ra.cksum);
+ }
+ kmem_free(drr, sizeof (dmu_replay_record_t));
}
- (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
if (ra.byteswap) {
+ struct drr_begin *drrb = drc->drc_drrb;
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
drrb->drr_version = BSWAP_64(drrb->drr_version);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
@@ -813,94 +1051,30 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
}
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
-
- if (drrb->drr_version != DMU_BACKUP_VERSION ||
- drrb->drr_type >= DMU_OST_NUMTYPES ||
- strchr(drrb->drr_toname, '@') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * Process the begin in syncing context.
- */
- if (drrb->drr_fromguid) {
- /* incremental backup */
- dsl_dataset_t *ds = NULL;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
- *cp = '@';
- if (ra.err)
- goto out;
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- drrb->drr_fromguid) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(ra.buf, ra.bufsize);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds);
- }
- ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- replay_incremental_check, replay_incremental_sync,
- ds, drrb, 1);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full backup */
- dsl_dir_t *dd = NULL;
- const char *tail;
-
- /* can't restore full backup into topmost fs, for now */
- if (strrchr(drrb->drr_toname, '/') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
- *cp = '@';
- if (ra.err)
- goto out;
- if (tail == NULL) {
- ra.err = EEXIST;
- goto out;
- }
+ ra.td = td;
+ ra.fp = fp;
+ ra.voff = *voffp;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
- ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
- replay_full_sync, dd, drrb, 5);
- dsl_dir_close(dd, FTAG);
- }
- if (ra.err)
- goto out;
+ /* these were verified in dmu_recv_begin */
+ ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
+ ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
/*
* Open the objset we are modifying.
*/
+ VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
- DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
- *cp = '@';
- ASSERT3U(ra.err, ==, 0);
+ ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
/*
* Read records and process them.
*/
- pzc = ra.zc;
+ pcksum = ra.cksum;
while (ra.err == 0 &&
NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (SIGPENDING(td)) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
ra.err = EINTR;
goto out;
}
@@ -947,63 +1121,116 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
* value, because the stored checksum is of
* everything before the DRR_END record.
*/
- if (drre.drr_checksum.zc_word[0] != 0 &&
- !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
+ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
ra.err = ECKSUM;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
- ds_dir->dd_pool, replay_end_check, replay_end_sync,
- os, drrb, 3);
goto out;
}
default:
ra.err = EINVAL;
goto out;
}
- pzc = ra.zc;
+ pcksum = ra.cksum;
}
+ ASSERT(ra.err != 0);
out:
- if (os)
- dmu_objset_close(os);
+ dmu_objset_close(os);
- /*
- * Make sure we don't rollback/destroy unless we actually
- * processed the begin properly. 'os' will only be set if this
- * is the case.
- */
- if (ra.err && os && tosnap && strchr(tosnap, '@')) {
+ if (ra.err != 0) {
/*
* rollback or destroy what we created, so we don't
* leave it in the restoring state.
*/
- dsl_dataset_t *ds;
- int err;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- err = dsl_dataset_open(tosnap,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err == 0) {
- txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (drrb->drr_fromguid) {
- /* incremental: rollback to most recent snap */
- (void) dsl_dataset_rollback(ds);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full: destroy whole fs */
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- (void) dsl_dataset_destroy(tosnap);
- }
- }
- *cp = '@';
+ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
+ dmu_recv_abort_cleanup(drc);
}
kmem_free(ra.buf, ra.bufsize);
- if (sizep)
- *sizep = ra.voff;
+ *voffp = ra.voff;
return (ra.err);
}
+
+struct recvendsyncarg {
+ char *tosnap;
+ uint64_t creation_time;
+ uint64_t toguid;
+};
+
+static int
+recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvendsyncarg *resa = arg2;
+
+ return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+}
+
+static void
+recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct recvendsyncarg *resa = arg2;
+
+ dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
+ ds->ds_prev->ds_phys->ds_guid = resa->toguid;
+ ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+ struct recvendsyncarg resa;
+ dsl_dataset_t *ds = drc->drc_logical_ds;
+ int err;
+
+ /*
+ * XXX hack; seems the ds is still dirty and
+ * dsl_pool_zil_clean() expects it to have a ds_user_ptr
+ * (and zil), but clone_swap() can close it.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+ if (ds != drc->drc_real_ds) {
+ /* we are doing an online recv */
+ if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+ err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+ drc->drc_force);
+ if (err)
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ } else {
+ err = EBUSY;
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ }
+ /* dsl_dataset_destroy() will disown the ds */
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ if (err)
+ return (err);
+ }
+
+ resa.creation_time = drc->drc_drrb->drr_creation_time;
+ resa.toguid = drc->drc_drrb->drr_toguid;
+ resa.tosnap = drc->drc_tosnap;
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ recv_end_check, recv_end_sync, ds, &resa, 3);
+ if (err) {
+ if (drc->drc_newfs) {
+ ASSERT(ds == drc->drc_real_ds);
+ (void) dsl_dataset_destroy(ds, dmu_recv_tag);
+ return (err);
+ } else {
+ (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
+ }
+ }
+
+ /* release the hold from dmu_recv_begin */
+ dsl_dataset_disown(ds, dmu_recv_tag);
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 3d2bc3e47678..43bf82e7a682 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
#define BP_SPAN_SHIFT(level, width) ((level) * (width))
@@ -261,6 +262,16 @@ advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
return (EAGAIN);
}
+/*
+ * The traverse_callback function will call the function specified in th_func.
+ * In the event of an error the callee, specified by th_func, must return
+ * one of the following errors:
+ *
+ * EINTR - Indicates that the callee wants the traversal to
+ * abort immediately.
+ * ERESTART - The callee has acknowledged the error and would
+ * like to continue.
+ */
static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
@@ -603,7 +614,10 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
th->th_locked = 0;
}
- rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+ if (BP_IS_HOLE(&dsp->ds_bp))
+ rc = ERESTART;
+ else
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
if (rc != 0) {
if (rc == ERESTART)
@@ -722,6 +736,24 @@ traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
}
int
+traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
+
+ traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
traverse_more(traverse_handle_t *th)
{
zseg_t *zseg = list_head(&th->th_seglist);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index 13fd8d4d9dce..000c3ce64eb5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
@@ -157,7 +155,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (EIO);
- err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
dbuf_rele(db, FTAG);
return (err);
}
@@ -294,6 +292,8 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh)
txh->txh_space_tooverwrite += space;
} else {
txh->txh_space_towrite += space;
+ if (dn && dn->dn_dbuf->db_blkptr)
+ txh->txh_space_tounref += space;
}
}
@@ -318,39 +318,25 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
- uint64_t blkid, nblks;
- uint64_t space = 0;
+ uint64_t blkid, nblks, lastblk;
+ uint64_t space = 0, unref = 0, skipped = 0;
dnode_t *dn = txh->txh_dnode;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- int dirty;
+ int epbs;
- /*
- * We don't need to use any locking to check for dirtyness
- * because it's OK if we get stale data -- the dnode may become
- * dirty immediately after our check anyway. This is just a
- * means to avoid the expensive count when we aren't sure we
- * need it. We need to be able to deal with a dirty dnode.
- */
- dirty = list_link_active(&dn->dn_dirty_link[0]) |
- list_link_active(&dn->dn_dirty_link[1]) |
- list_link_active(&dn->dn_dirty_link[2]) |
- list_link_active(&dn->dn_dirty_link[3]);
- if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+ if (dn->dn_nlevels == 0)
return;
/*
- * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * The struct_rwlock protects us against dn_nlevels
* changing, in case (against all odds) we manage to dirty &
* sync out the changes after we check for being dirty.
- * also, dbuf_hold_impl() wants us to have the struct_rwlock.
- *
- * It's fine to use dn_datablkshift rather than the dn_phys
- * equivalent because if it is changing, maxblkid==0 and we will
- * bail.
+ * Also, dbuf_hold_level() wants us to have the struct_rwlock.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_phys->dn_maxblkid == 0) {
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_maxblkid == 0) {
if (off == 0 && len >= dn->dn_datablksz) {
blkid = 0;
nblks = 1;
@@ -360,78 +346,120 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
} else {
blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
+ nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_phys->dn_maxblkid) {
+ if (blkid >= dn->dn_maxblkid) {
rw_exit(&dn->dn_struct_rwlock);
return;
}
- if (blkid + nblks > dn->dn_phys->dn_maxblkid)
- nblks = dn->dn_phys->dn_maxblkid - blkid;
+ if (blkid + nblks > dn->dn_maxblkid)
+ nblks = dn->dn_maxblkid - blkid;
- /* don't bother after 128,000 blocks */
- nblks = MIN(nblks, 128*1024);
}
-
- if (dn->dn_phys->dn_nlevels == 1) {
+ if (dn->dn_nlevels == 1) {
int i;
for (i = 0; i < nblks; i++) {
blkptr_t *bp = dn->dn_phys->dn_blkptr;
- ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ ASSERT3U(blkid + i, <, dn->dn_nblkptr);
bp += blkid + i;
if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
space += bp_get_dasize(spa, bp);
}
+ unref += BP_GET_ASIZE(bp);
}
nblks = 0;
}
+ /*
+ * Add in memory requirements of higher-level indirects.
+ * This assumes a worst-possible scenario for dn_nlevels.
+ */
+ {
+ uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
+ int level = (dn->dn_nlevels > 1) ? 2 : 1;
+
+ while (level++ < DN_MAX_LEVELS) {
+ txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
+ blkcnt = 1 + (blkcnt >> epbs);
+ }
+ ASSERT(blkcnt <= dn->dn_nblkptr);
+ }
+
+ lastblk = blkid + nblks - 1;
while (nblks) {
dmu_buf_impl_t *dbuf;
- int err, epbs, blkoff, tochk;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- blkoff = P2PHASE(blkid, 1<<epbs);
- tochk = MIN((1<<epbs) - blkoff, nblks);
-
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
- if (err == 0) {
- int i;
- blkptr_t *bp;
-
- err = dbuf_read(dbuf, NULL,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- dbuf_rele(dbuf, FTAG);
- break;
- }
+ uint64_t ibyte, new_blkid;
+ int epb = 1 << epbs;
+ int err, i, blkoff, tochk;
+ blkptr_t *bp;
+
+ ibyte = blkid << dn->dn_datablkshift;
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
+ new_blkid = ibyte >> dn->dn_datablkshift;
+ if (err == ESRCH) {
+ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+ break;
+ }
+ if (err) {
+ txh->txh_tx->tx_err = err;
+ break;
+ }
+ if (new_blkid > lastblk) {
+ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+ break;
+ }
- bp = dbuf->db.db_data;
- bp += blkoff;
+ if (new_blkid > blkid) {
+ ASSERT((new_blkid >> epbs) > (blkid >> epbs));
+ skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
+ nblks -= new_blkid - blkid;
+ blkid = new_blkid;
+ }
+ blkoff = P2PHASE(blkid, epb);
+ tochk = MIN(epb - blkoff, nblks);
- for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth)) {
- dprintf_bp(&bp[i],
- "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
- }
- }
+ dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
+
+ txh->txh_memory_tohold += dbuf->db.db_size;
+ if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
+ txh->txh_tx->tx_err = E2BIG;
dbuf_rele(dbuf, FTAG);
+ break;
}
- if (err && err != ENOENT) {
+ err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
txh->txh_tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
break;
}
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+ dprintf_bp(&bp[i], "can free old%s", "");
+ space += bp_get_dasize(spa, &bp[i]);
+ }
+ unref += BP_GET_ASIZE(bp);
+ }
+ dbuf_rele(dbuf, FTAG);
+
blkid += tochk;
nblks -= tochk;
}
rw_exit(&dn->dn_struct_rwlock);
+ /* account for new level 1 indirect blocks that might show up */
+ if (skipped > 0) {
+ txh->txh_fudge += skipped << dn->dn_indblkshift;
+ skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
+ txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+ }
txh->txh_space_tofree += space;
+ txh->txh_space_tounref += unref;
}
void
@@ -466,7 +494,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
/*
* For i/o error checking, read the first and last level-0
* blocks, and all the level-1 blocks. The above count_write's
- * will take care of the level-0 blocks.
+ * have already taken care of the level-0 blocks.
*/
if (dn->dn_nlevels > 1) {
shift = dn->dn_datablkshift + dn->dn_indblkshift -
@@ -478,7 +506,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
NULL, NULL, ZIO_FLAG_CANFAIL);
for (i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
if (err == ESRCH)
break;
@@ -550,10 +578,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
* the size will change between now and the dbuf dirty call.
*/
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth))
+ dn->dn_phys->dn_blkptr[0].blk_birth)) {
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
+ } else {
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref +=
+ BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
+ }
return;
}
@@ -575,7 +606,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
* 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
*/
dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
- (3 + add ? 3 : 0) << dn->dn_datablkshift);
+ (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
/*
* If the modified blocks are scattered to the four winds,
@@ -698,12 +729,13 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
match_offset = TRUE;
break;
case THT_FREE:
- if (blkid == beginblk &&
- (txh->txh_arg1 != 0 ||
- dn->dn_maxblkid == 0))
- match_offset = TRUE;
- if (blkid == endblk &&
- txh->txh_arg2 != DMU_OBJECT_END)
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
case THT_BONUS:
@@ -733,12 +765,32 @@ static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
dmu_tx_hold_t *txh;
- uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
+ spa_t *spa = tx->tx_pool->dp_spa;
+ uint64_t memory, asize, fsize, usize;
+ uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
ASSERT3U(tx->tx_txg, ==, 0);
+
if (tx->tx_err)
return (tx->tx_err);
+ if (spa_suspended(spa)) {
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ txg_how != TXG_WAIT)
+ return (EIO);
+
+ return (ERESTART);
+ }
+
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
@@ -748,7 +800,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
* dmu_tx_unassign() logic.
*/
- towrite = tofree = tooverwrite = 0;
+ towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
@@ -768,6 +820,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
towrite += txh->txh_space_towrite;
tofree += txh->txh_space_tofree;
tooverwrite += txh->txh_space_tooverwrite;
+ tounref += txh->txh_space_tounref;
+ tohold += txh->txh_memory_tohold;
+ fudge += txh->txh_fudge;
}
/*
@@ -788,22 +843,31 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
tooverwrite = tofree = 0;
}
- /*
- * Convert logical size to worst-case allocated size.
- */
+ /* needed allocation: worst-case estimate of write space */
+ asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
+ /* freed space estimate: worst-case overwrite + free estimate */
fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
- lsize = towrite + tooverwrite;
- asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ /* convert unrefd space to worst-case estimate */
+ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+ /* calculate memory footprint estimate */
+ memory = towrite + tooverwrite + tohold;
#ifdef ZFS_DEBUG
- tx->tx_space_towrite = asize;
+ /*
+ * Add in 'tohold' to account for our dirty holds on this memory
+ * XXX - the "fudge" factor is to account for skipped blocks that
+ * we missed because dnode_next_offset() misses in-core-only blocks.
+ */
+ tx->tx_space_towrite = asize +
+ spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
tx->tx_space_tofree = tofree;
tx->tx_space_tooverwrite = tooverwrite;
+ tx->tx_space_tounref = tounref;
#endif
if (tx->tx_dir && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir,
- lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
if (err)
return (err);
}
@@ -885,10 +949,18 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
void
dmu_tx_wait(dmu_tx_t *tx)
{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
ASSERT(tx->tx_txg == 0);
- ASSERT(tx->tx_lasttried_txg != 0);
- if (tx->tx_needassign_txh) {
+ /*
+ * It's possible that the pool has become active after this thread
+ * has tried to obtain a tx. If that's the case then his
+ * tx_lasttried_txg would not have been assigned.
+ */
+ if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
@@ -948,6 +1020,7 @@ dmu_tx_commit(dmu_tx_t *tx)
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
+ list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
@@ -975,6 +1048,7 @@ dmu_tx_abort(dmu_tx_t *tx)
if (dn != NULL)
dnode_rele(dn, tx);
}
+ list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
refcount_destroy_many(&tx->tx_space_written,
refcount_count(&tx->tx_space_written));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index b25cc898c37d..8dba38176527 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -38,10 +38,6 @@
*/
int zfs_prefetch_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
- &zfs_prefetch_disable, 0, "Disable prefetch");
/* max # of streams per zfetch */
uint32_t zfetch_max_streams = 8;
@@ -52,6 +48,25 @@ uint32_t zfetch_block_cap = 256;
/* number of bytes in a array_read at which we stop prefetching (1Mb) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
+ &zfs_prefetch_disable, 0, "Disable prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
+TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN,
+ &zfetch_max_streams, 0, "Max # of streams per zfetch");
+TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
+ &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
+TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
+ &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
+TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
+SYSCTL_QUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
+ &zfetch_array_rd_sz, 0,
+ "Number of bytes in a array_read at which we stop prefetching");
+
/* forward decls for static routines */
static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index ca502857b1fa..5adbc3c0ff5d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
@@ -242,6 +240,23 @@ free_range_compar(const void *node1, const void *node2)
else return (0);
}
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -285,6 +300,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
list_insert_head(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
+ arc_space_consume(sizeof (dnode_t));
return (dn);
}
@@ -319,6 +335,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_bonus = NULL;
}
kmem_cache_free(dnode_cache, dn);
+ arc_space_return(sizeof (dnode_t));
}
void
@@ -362,6 +379,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -389,6 +407,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -396,7 +415,7 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int i;
+ int i, old_nblkptr;
dmu_buf_impl_t *db = NULL;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@@ -413,7 +432,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
/* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -436,38 +455,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
}
dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
rw_exit(&dn->dn_struct_rwlock);
- if (db) {
+ if (db)
dbuf_rele(db, FTAG);
- db = NULL;
- }
/* change type */
dn->dn_type = ot;
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
/* change bonus size and type */
mutex_enter(&dn->dn_mtx);
+ old_nblkptr = dn->dn_nblkptr;
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
@@ -475,12 +474,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
+ /* XXX - for now, we can't make nblkptr smaller */
+ ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+ /* fix up the bonus db_size if dn_nblkptr has changed */
+ if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) {
+ dn->dn_bonus->db.db_size =
+ DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
@@ -559,6 +561,12 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
dmu_buf_impl_t *db;
dnode_t **children_dnodes;
+ /*
+ * If you are holding the spa config lock as writer, you shouldn't
+ * be asking the DMU to do *anything*.
+ */
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
@@ -602,9 +610,10 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
}
if ((dn = children_dnodes[idx]) == NULL) {
+ dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
- db, object);
+
+ dn = dnode_create(os, dnp, db, object);
winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
if (winner != NULL) {
dnode_destroy(dn);
@@ -644,11 +653,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
-void
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
dnode_add_ref(dnode_t *dn, void *tag)
{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
+ mutex_enter(&dn->dn_mtx);
+ if (refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
}
void
@@ -656,7 +676,9 @@ dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
+ mutex_exit(&dn->dn_mtx);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
dbuf_rele(dn->dn_dbuf, dn);
@@ -692,6 +714,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
@@ -714,7 +737,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
(void) dbuf_dirty(dn->dn_dbuf, tx);
@@ -762,7 +785,7 @@ int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
- int have_db0 = FALSE;
+ int err;
if (size == 0)
size = SPA_MINBLOCKSIZE;
@@ -787,9 +810,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid == 0) {
- have_db0 = TRUE;
- } else if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -799,12 +820,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs && dn->dn_nlevels != 1)
goto fail;
- db = NULL;
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ if (err == 0)
dbuf_new_size(db, size, tx);
- }
+ else if (err != ENOENT)
+ goto fail;
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
@@ -813,7 +834,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dn->dn_indblkshift = ibs;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
}
-
+ /* rele after we have fixed the blocksize in the dnode */
if (db)
dbuf_rele(db, FTAG);
@@ -825,19 +846,32 @@ fail:
return (ENOTSUP);
}
+/* read-holding callers must not rely on the lock being continuously held */
void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
- int drop_struct_lock = FALSE;
int epbs, new_nlevels;
uint64_t sz;
ASSERT(blkid != DB_BONUS_BLKID);
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- drop_struct_lock = TRUE;
+ ASSERT(have_read ?
+ RW_READ_HELD(&dn->dn_struct_rwlock) :
+ RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * if we have a read-lock, check to see if we need to do any work
+ * before upgrading to a write-lock.
+ */
+ if (have_read) {
+ if (blkid <= dn->dn_maxblkid)
+ return;
+
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
}
if (blkid <= dn->dn_maxblkid)
@@ -889,8 +923,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
}
out:
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
+ if (have_read)
+ rw_downgrade(&dn->dn_struct_rwlock);
}
void
@@ -951,15 +985,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
uint64_t blkoff, blkid, nblks;
- int blksz, head;
+ int blksz, blkshift, head, tail;
int trunc = FALSE;
+ int epbs;
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- /* If the range is past the end of the file, this is a no-op */
- if (off >= blksz * (dn->dn_maxblkid+1))
- goto out;
if (len == -1ULL) {
len = UINT64_MAX - off;
trunc = TRUE;
@@ -971,11 +1005,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (ISP2(blksz)) {
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
- /* Freeing the whole block; don't do any head. */
- head = 0;
+ /* Freeing the whole block; fast-track this request */
+ blkid = 0;
+ nblks = 1;
+ goto done;
+ } else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ goto out;
} else {
/* Freeing part of the block. */
head = blksz - off;
@@ -1008,88 +1049,95 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
/* If the range was less than one block, we're done */
- if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ if (len == 0)
goto out;
- if (!ISP2(blksz)) {
- /*
- * They are freeing the whole block of a
- * non-power-of-two blocksize file. Skip all the messy
- * math.
- */
- ASSERT3U(off, ==, 0);
- ASSERT3U(len, >=, blksz);
- blkid = 0;
- nblks = 1;
- } else {
- int tail;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int blkshift = dn->dn_datablkshift;
-
- /* If the remaining range is past end of file, we're done */
- if (off > dn->dn_maxblkid << blkshift)
- goto out;
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
- if (off + len == UINT64_MAX)
- tail = 0;
- else
- tail = P2PHASE(len, blksz);
-
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr &&
- !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock,
- RW_WRITER);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
+ ASSERT(ISP2(blksz));
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ bzero(db->db.db_data, tail);
}
- len -= tail;
+ dbuf_rele(db, FTAG);
}
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
+ len -= tail;
+ }
- /* dirty the left indirects */
- if (dn->dn_nlevels > 1 && off != 0) {
- db = dbuf_hold_level(dn, 1,
- (off - head) >> (blkshift + epbs), FTAG);
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
+
+ /*
+ * Read in and mark all the level-1 indirects dirty,
+ * so that they will stay in memory until syncing phase.
+ * Always dirty the first and last indirect to make sure
+ * we dirty all the partial indirects.
+ */
+ if (dn->dn_nlevels > 1) {
+ uint64_t i, first, last;
+ int shift = epbs + dn->dn_datablkshift;
+
+ first = blkid >> epbs;
+ if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
-
- /* dirty the right indirects */
- if (dn->dn_nlevels > 1 && !trunc) {
- db = dbuf_hold_level(dn, 1,
- (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
+ else
+ last = (blkid + nblks - 1) >> epbs;
+ if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
-
- /*
- * Finally, add this range to the dnode range list, we
- * will finish up this free operation in the syncing phase.
- */
- ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
- ASSERT(off + len == UINT64_MAX ||
- IS_P2ALIGNED(len, 1<<blkshift));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
-
- if (trunc)
- dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ for (i = first + 1; i < last; i++) {
+ uint64_t ibyte = i << shift;
+ int err;
+
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i >= last)
+ break;
+ ASSERT(err == 0);
+ db = dbuf_hold_level(dn, 1, i, FTAG);
+ if (db) {
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+ }
}
-
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, blkid, nblks, tx);
{
@@ -1109,9 +1157,12 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
mutex_exit(&dn->dn_mtx);
- dbuf_free_range(dn, blkid, nblks, tx);
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
+ if (trunc && dn->dn_maxblkid >= (off >> blkshift))
+ dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
+
rw_exit(&dn->dn_struct_rwlock);
}
@@ -1179,7 +1230,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
ASSERT3U(space, >=, -delta); /* no underflow */
}
space += delta;
- if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
@@ -1211,7 +1262,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
}
static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
@@ -1219,11 +1270,16 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t epb = 1ULL << epbs;
uint64_t minfill, maxfill;
- int i, error, span;
+ boolean_t hole;
+ int i, inc, error, span;
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+ hole = flags & DNODE_FIND_HOLE;
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+ ASSERT(txg == 0 || !hole);
+
if (lvl == dn->dn_phys->dn_nlevels) {
error = 0;
epb = dn->dn_phys->dn_nblkptr;
@@ -1232,9 +1288,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
if (error) {
- if (error == ENOENT)
- return (hole ? 0 : ESRCH);
- return (error);
+ if (error != ENOENT)
+ return (error);
+ if (hole)
+ return (0);
+ /*
+ * This can only happen when we are searching up
+ * the block tree for data. We don't really need to
+ * adjust the offset, as we will just end up looking
+ * at the pointer to this block in its parent, and its
+ * going to be unallocated, so we will skip over it.
+ */
+ return (ESRCH);
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
if (error) {
@@ -1246,13 +1311,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
if (db && txg &&
(db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+ /*
+ * This can only happen when we are searching up the tree
+ * and these conditions mean that we need to keep climbing.
+ */
error = ESRCH;
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
span = DNODE_SHIFT;
ASSERT(dn->dn_type == DMU_OT_DNODE);
- for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ for (i = (*offset >> span) & (blkfill - 1);
+ i >= 0 && i < blkfill; i += inc) {
boolean_t newcontents = B_TRUE;
if (txg) {
int j;
@@ -1264,9 +1334,9 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
}
if (!dnp[i].dn_type == hole && newcontents)
break;
- *offset += 1ULL << span;
+ *offset += (1ULL << span) * inc;
}
- if (i == blkfill)
+ if (i < 0 || i == blkfill)
error = ESRCH;
} else {
blkptr_t *bp = data;
@@ -1280,14 +1350,17 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
minfill++;
for (i = (*offset >> span) & ((1ULL << epbs) - 1);
- i < epb; i++) {
+ i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill &&
bp[i].blk_fill <= maxfill &&
- bp[i].blk_birth > txg)
+ (hole || bp[i].blk_birth > txg))
break;
- *offset += 1ULL << span;
+ if (inc < 0 && *offset < (1ULL << span))
+ *offset = 0;
+ else
+ *offset += (1ULL << span) * inc;
}
- if (i >= epb)
+ if (i < 0 || i == epb)
error = ESRCH;
}
@@ -1306,64 +1379,66 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
*
* Examples:
*
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- * Finds the next hole/data in a file.
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
* Used in dmu_offset_next().
*
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
* Finds the next free/allocated dnode an objset's meta-dnode.
* Only finds objects that have new contents since txg (ie.
* bonus buffer changes and content removal are ignored).
* Used in dmu_object_next().
*
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
* Used in dmu_object_alloc().
*/
int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
+ uint64_t initial_offset = *offset;
int lvl, maxlvl;
int error = 0;
- uint64_t initial_offset = *offset;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_phys->dn_nlevels == 0) {
- rw_exit(&dn->dn_struct_rwlock);
- return (ESRCH);
+ error = ESRCH;
+ goto out;
}
if (dn->dn_datablkshift == 0) {
if (*offset < dn->dn_datablksz) {
- if (hole)
+ if (flags & DNODE_FIND_HOLE)
*offset = dn->dn_datablksz;
} else {
error = ESRCH;
}
- rw_exit(&dn->dn_struct_rwlock);
- return (error);
+ goto out;
}
maxlvl = dn->dn_phys->dn_nlevels;
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
if (error != ESRCH)
break;
}
- while (--lvl >= minlvl && error == 0) {
+ while (error == 0 && --lvl >= minlvl) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
}
- rw_exit(&dn->dn_struct_rwlock);
-
- if (error == 0 && initial_offset > *offset)
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
error = ESRCH;
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 9e8c7adbda01..a46d4e70abc8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,9 +55,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
ASSERT(db != NULL);
dn->dn_phys->dn_nlevels = new_level;
- dprintf("os=%p obj=%llu, increase to %d\n",
- dn->dn_objset, dn->dn_object,
- dn->dn_phys->dn_nlevels);
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
/* check for existing blkptrs in the dnode */
for (i = 0; i < nblkptr; i++)
@@ -110,25 +109,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
-static void
+static int
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
- int i;
+ int i, blocks_freed = 0;
- dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
for (i = 0; i < num; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += bp_get_dasize(os->os_spa, bp);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
bzero(bp, sizeof (blkptr_t));
+ blocks_freed += 1;
}
dnode_diduse_space(dn, -bytesfreed);
+ return (blocks_freed);
}
#ifdef ZFS_DEBUG
@@ -160,7 +160,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
rw_exit(&db->db_dnode->dn_struct_rwlock);
if (err == ENOENT)
continue;
@@ -178,7 +178,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
if (buf[j] != 0) {
panic("freed data not zero: "
"child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
+ (void *)child, i, off, num);
}
}
}
@@ -195,7 +195,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
if (buf[j] != 0) {
panic("freed data not zero: "
"child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
+ (void *)child, i, off, num);
}
}
}
@@ -206,6 +206,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
+#define ALL -1
+
static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
@@ -216,8 +218,18 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
int all = TRUE;
+ int blocks_freed = 0;
+
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if we didn't get a dirty hold (because this block had just
+ * finished being written -- and so had no holds), and then this
+ * block got evicted before we got here.
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
bp = (blkptr_t *)db->db.db_data;
@@ -241,10 +253,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
+ blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
for (i = start; i <= end; i++, bp++) {
@@ -255,9 +267,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(subdb->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ blocks_freed += free_blocks(dn, bp, 1, tx);
} else {
all = FALSE;
}
@@ -274,8 +286,8 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
ASSERT3U(bp->blk_birth, ==, 0);
}
#endif
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
/*
@@ -305,15 +317,14 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
+ (void) free_blocks(dn, bp + blkid, nblks, tx);
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off,
- 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
return;
}
@@ -331,9 +342,9 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(db, blkid, nblks, trunc, tx)) {
+ if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(db->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ (void) free_blocks(dn, bp, 1, tx);
}
dbuf_rele(db, FTAG);
}
@@ -343,15 +354,15 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
}
/*
* Try to kick all the dnodes dbufs out of the cache...
*/
-int
-dnode_evict_dbufs(dnode_t *dn, int try)
+void
+dnode_evict_dbufs(dnode_t *dn)
{
int progress;
int pass = 0;
@@ -367,6 +378,7 @@ dnode_evict_dbufs(dnode_t *dn, int try)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
+ ASSERT3P(db->db_dnode, ==, dn);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
@@ -375,7 +387,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
mutex_exit(&db->db_mtx);
} else if (refcount_is_zero(&db->db_holds)) {
progress = TRUE;
- ASSERT(!arc_released(db->db_buf));
dbuf_clear(db); /* exits db_mtx for us */
} else {
mutex_exit(&db->db_mtx);
@@ -397,21 +408,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
ASSERT(pass < 100); /* sanity check */
} while (progress);
- /*
- * This function works fine even if it can't evict everything.
- * If were only asked to try to evict everything then
- * return an error if we can't. Otherwise panic as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
@@ -419,7 +415,6 @@ dnode_evict_dbufs(dnode_t *dn, int try)
dn->dn_bonus = NULL;
}
rw_exit(&dn->dn_struct_rwlock);
- return (0);
}
static void
@@ -460,8 +455,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -479,10 +481,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_indblkshift[txgoff] = 0;
dn->dn_next_blksz[txgoff] = 0;
- /* free up all the blocks in the file. */
- dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
- ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
/* ASSERT(blkptrs are zero); */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -496,6 +494,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_type = DMU_OT_NONE;
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -558,7 +557,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(list) != NULL ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -566,6 +565,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_blksz[txgoff] = 0;
}
+ if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -583,20 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
/* process all the "freed" ranges in the file */
- if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
- for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
- rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
- dnode_sync_free_range(dn,
- rp->fr_blkid, rp->fr_nblks, tx);
+ while (rp = avl_last(&dn->dn_ranges[txgoff])) {
+ dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
+ /* grab the mutex so we don't race with dnode_block_freed() */
+ mutex_enter(&dn->dn_mtx);
+ avl_remove(&dn->dn_ranges[txgoff], rp);
+ mutex_exit(&dn->dn_mtx);
+ kmem_free(rp, sizeof (free_range_t));
}
- mutex_enter(&dn->dn_mtx);
- for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
- free_range_t *last = rp;
- rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
- avl_remove(&dn->dn_ranges[txgoff], last);
- kmem_free(last, sizeof (free_range_t));
- }
- mutex_exit(&dn->dn_mtx);
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
dnode_sync_free(dn, tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 7d4689f3352a..20d8ec85cc91 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
@@ -38,35 +36,44 @@
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/zfs_znode.h>
+#include <sys/sunddi.h>
+
+static char *dsl_reaper = "the grim reaper";
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_checkfunc_t dsl_dataset_rollback_check;
static dsl_syncfunc_t dsl_dataset_rollback_sync;
-static dsl_checkfunc_t dsl_dataset_destroy_check;
-static dsl_syncfunc_t dsl_dataset_destroy_sync;
+static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
+
+
/*
- * We use weighted reference counts to express the various forms of exclusion
- * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
- * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
- * This makes the exclusion logic simple: the total refcnt for all opens cannot
- * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
- * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
- * just over half of the refcnt space, so there can't be more than one, but it
- * can peacefully coexist with any number of STANDARD opens.
+ * Figure out how much of this delta should be propogated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
*/
-static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
- 0, /* DS_MODE_NONE - invalid */
- 1, /* DS_MODE_STANDARD - unlimited number */
- (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
- DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
-};
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ uint64_t old_bytes, new_bytes;
+ if (ds->ds_reserved == 0)
+ return (delta);
+
+ old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+ return (new_bytes - old_bytes);
+}
void
dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
@@ -74,6 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
dprintf_bp(bp, "born, ds=%p\n", ds);
@@ -89,23 +97,28 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
* dsl_dir.
*/
ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
used, compressed, uncompressed, tx);
dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
return;
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
ds->ds_phys->ds_used_bytes += used;
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- used, compressed, uncompressed, tx);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+ compressed, uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
}
-void
+int
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx)
{
@@ -113,10 +126,11 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
+ ASSERT(pio != NULL);
ASSERT(dmu_tx_is_syncing(tx));
/* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
- return;
+ return (0);
ASSERT(used > 0);
if (ds == NULL) {
@@ -125,51 +139,59 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
* Account for the meta-objset space in its placeholder
* dataset.
*/
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ err = dsl_free(pio, tx->tx_pool,
+ tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
ASSERT(err == 0);
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-used, -compressed, -uncompressed, tx);
dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
+ return (used);
}
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+ ASSERT(!dsl_dataset_is_snapshot(ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
int err;
+ int64_t delta;
dprintf_bp(bp, "freeing: %s", "");
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ err = dsl_free(pio, tx->tx_pool,
+ tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
ASSERT(err == 0);
+ mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- /* XXX unique_bytes is not accurate for head datasets */
- /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+ ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
ds->ds_phys->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ delta, -compressed, -uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
- ds->ds_prev->ds_phys->ds_prev_snap_txg) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- mutex_enter(&ds->ds_prev->ds_lock);
- ds->ds_prev->ds_phys->ds_unique_bytes +=
- used;
- mutex_exit(&ds->ds_prev->ds_lock);
- }
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ ds->ds_prev->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ if (bp->blk_birth > ds->ds_origin_txg) {
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
}
}
mutex_enter(&ds->ds_lock);
@@ -180,6 +202,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
mutex_exit(&ds->ds_lock);
+
+ return (used);
}
uint64_t
@@ -216,32 +240,38 @@ static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
dsl_dataset_t *ds = dsv;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- /* open_refcount == DS_REF_MAX when deleting */
- ASSERT(ds->ds_open_refcount == 0 ||
- ds->ds_open_refcount == DS_REF_MAX);
+ ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
dprintf_ds(ds, "evicting %s\n", "");
- unique_remove(ds->ds_phys->ds_fsid_guid);
+ unique_remove(ds->ds_fsid_guid);
if (ds->ds_user_ptr != NULL)
ds->ds_user_evict_func(ds, ds->ds_user_ptr);
if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ dsl_dataset_drop_ref(ds->ds_prev, ds);
ds->ds_prev = NULL;
}
bplist_close(&ds->ds_deadlist);
- dsl_dir_close(ds->ds_dir, ds);
+ if (ds->ds_dir)
+ dsl_dir_close(ds->ds_dir, ds);
- if (list_link_active(&ds->ds_synced_link))
- list_remove(&dp->dp_synced_objsets, ds);
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ if (mutex_owned(&ds->ds_lock))
+ mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
+ if (mutex_owned(&ds->ds_opening_lock))
+ mutex_exit(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ if (mutex_owned(&ds->ds_deadlist.bpl_lock))
+ mutex_exit(&ds->ds_deadlist.bpl_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
}
@@ -266,16 +296,54 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
return (err);
headphys = headdbuf->db_data;
err = zap_value_search(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
dmu_buf_rele(headdbuf, FTAG);
return (err);
}
-int
-dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag, dsl_dataset_t **dsp)
+static int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ matchtype_t mt;
+ int err;
+
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_FIRST;
+ else
+ mt = MT_EXACT;
+
+ err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+ value, mt, NULL, 0, NULL);
+ if (err == ENOTSUP && mt == MT_FIRST)
+ err = zap_lookup(mos, snapobj, name, 8, 1, value);
+ return (err);
+}
+
+static int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ matchtype_t mt;
+ int err;
+
+ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_FIRST;
+ else
+ mt = MT_EXACT;
+
+ err = zap_remove_norm(mos, snapobj, name, mt, tx);
+ if (err == ENOTSUP && mt == MT_FIRST)
+ err = zap_remove(mos, snapobj, name, tx);
+ return (err);
+}
+
+static int
+dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
@@ -297,8 +365,11 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
ds->ds_phys = dbuf->db_data;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
NULL);
+ rw_init(&ds->ds_rwlock, 0, 0, 0);
+ cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
err = bplist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
@@ -312,42 +383,65 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
* just opened it.
*/
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
}
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+ if (!dsl_dataset_is_snapshot(ds)) {
ds->ds_snapname[0] = '\0';
if (ds->ds_phys->ds_prev_snap_obj) {
- err = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds, &ds->ds_prev);
+ err = dsl_dataset_get_ref(dp,
+ ds->ds_phys->ds_prev_snap_obj,
+ ds, &ds->ds_prev);
}
- } else {
- if (snapname) {
-#ifdef ZFS_DEBUG
- dsl_dataset_phys_t *headphys;
- dmu_buf_t *headdbuf;
- err = dmu_bonus_hold(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
+
+ if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_t *origin;
+
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj,
+ FTAG, &origin);
if (err == 0) {
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1,
- &foundobj);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele(headdbuf, FTAG);
+ ds->ds_origin_txg =
+ origin->ds_phys->ds_creation_txg;
+ dsl_dataset_rele(origin, FTAG);
}
-#endif
- (void) strcat(ds->ds_snapname, snapname);
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
}
+ } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+ err = dsl_dataset_get_snapname(ds);
+ }
+
+ if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+ /*
+ * In sync context, we're called with either no lock
+ * or with the write lock. If we're not syncing,
+ * we're always called with the read lock held.
+ */
+ boolean_t need_lock =
+ !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
+ dsl_pool_sync_context(dp);
+
+ if (need_lock)
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+ err = dsl_prop_get_ds(ds,
+ "refreservation", sizeof (uint64_t), 1,
+ &ds->ds_reserved, NULL);
+ if (err == 0) {
+ err = dsl_prop_get_ds(ds,
+ "refquota", sizeof (uint64_t), 1,
+ &ds->ds_quota, NULL);
+ }
+
+ if (need_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ } else {
+ ds->ds_reserved = ds->ds_quota = 0;
}
if (err == 0) {
@@ -356,13 +450,14 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
if (err || winner) {
bplist_close(&ds->ds_deadlist);
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev,
- DS_MODE_NONE, ds);
- }
+ if (ds->ds_prev)
+ dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ rw_destroy(&ds->ds_rwlock);
+ cv_destroy(&ds->ds_exclusive_cv);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err) {
dmu_buf_rele(dbuf, tag);
@@ -370,101 +465,175 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
ds = winner;
} else {
- uint64_t new =
+ ds->ds_fsid_guid =
unique_insert(ds->ds_phys->ds_fsid_guid);
- if (new != ds->ds_phys->ds_fsid_guid) {
- /* XXX it won't necessarily be synced... */
- ds->ds_phys->ds_fsid_guid = new;
- }
}
}
ASSERT3P(ds->ds_dbuf, ==, dbuf);
ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
mutex_enter(&ds->ds_lock);
- if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
- (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
- !DS_MODE_IS_INCONSISTENT(mode)) ||
- (ds->ds_open_refcount + weight > DS_REF_MAX)) {
+ if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
mutex_exit(&ds->ds_lock);
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (EBUSY);
+ dmu_buf_rele(ds->ds_dbuf, tag);
+ return (ENOENT);
}
- ds->ds_open_refcount += weight;
mutex_exit(&ds->ds_lock);
-
*dsp = ds;
return (0);
}
+static int
+dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /*
+ * In syncing context we don't want the rwlock lock: there
+ * may be an existing writer waiting for sync phase to
+ * finish. We don't need to worry about such writers, since
+ * sync phase is single-threaded, so the writer can't be
+ * doing anything while we are active.
+ */
+ if (dsl_pool_sync_context(dp)) {
+ ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
+ return (0);
+ }
+
+ /*
+ * Normal users will hold the ds_rwlock as a READER until they
+ * are finished (i.e., call dsl_dataset_rele()). "Owners" will
+ * drop their READER lock after they set the ds_owner field.
+ *
+ * If the dataset is being destroyed, the destroy thread will
+ * obtain a WRITER lock for exclusive access after it's done its
+ * open-context work and then change the ds_owner to
+ * dsl_reaper once destruction is assured. So threads
+ * may block here temporarily, until the "destructability" of
+ * the dataset is determined.
+ */
+ ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
+ mutex_enter(&ds->ds_lock);
+ while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
+ rw_exit(&dp->dp_config_rwlock);
+ cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
+ if (DSL_DATASET_IS_DESTROYED(ds)) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_drop_ref(ds, tag);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ return (ENOENT);
+ }
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ }
+ mutex_exit(&ds->ds_lock);
+ return (0);
+}
+
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
+
+ if (err)
+ return (err);
+ return (dsl_dataset_hold_ref(*dsp, tag));
+}
+
int
-dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp)
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
+ dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
+
+ ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
+
+ if (err)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EBUSY);
+ }
+ return (0);
+}
+
+int
+dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
dsl_pool_t *dp;
- const char *tail;
+ const char *snapname;
uint64_t obj;
- dsl_dataset_t *ds = NULL;
int err = 0;
- err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
if (err)
return (err);
dp = dd->dd_pool;
obj = dd->dd_phys->dd_head_dataset_obj;
rw_enter(&dp->dp_config_rwlock, RW_READER);
- if (obj == 0) {
- /* A dataset with no associated objset */
+ if (obj)
+ err = dsl_dataset_get_ref(dp, obj, tag, dsp);
+ else
err = ENOENT;
+ if (err)
goto out;
- }
- if (tail != NULL) {
- objset_t *mos = dp->dp_meta_objset;
+ err = dsl_dataset_hold_ref(*dsp, tag);
- err = dsl_dataset_open_obj(dp, obj, NULL,
- DS_MODE_NONE, tag, &ds);
- if (err)
- goto out;
- obj = ds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- ds = NULL;
+ /* we may be looking for a snapshot */
+ if (err == 0 && snapname != NULL) {
+ dsl_dataset_t *ds = NULL;
- if (tail[0] != '@') {
+ if (*snapname++ != '@') {
+ dsl_dataset_rele(*dsp, tag);
err = ENOENT;
goto out;
}
- tail++;
- /* Look for a snapshot */
- if (!DS_MODE_IS_READONLY(mode)) {
- err = EROFS;
- goto out;
+ dprintf("looking for snapshot '%s'\n", snapname);
+ err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+ if (err == 0)
+ err = dsl_dataset_get_ref(dp, obj, tag, &ds);
+ dsl_dataset_rele(*dsp, tag);
+
+ ASSERT3U((err == 0), ==, (ds != NULL));
+
+ if (ds) {
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_snapname[0] == 0)
+ (void) strlcpy(ds->ds_snapname, snapname,
+ sizeof (ds->ds_snapname));
+ mutex_exit(&ds->ds_lock);
+ err = dsl_dataset_hold_ref(ds, tag);
+ *dsp = err ? NULL : ds;
}
- dprintf("looking for snapshot '%s'\n", tail);
- err = zap_lookup(mos, obj, tail, 8, 1, &obj);
- if (err)
- goto out;
}
- err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
-
out:
rw_exit(&dp->dp_config_rwlock);
dsl_dir_close(dd, FTAG);
-
- ASSERT3U((err == 0), ==, (ds != NULL));
- /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
-
- *dsp = ds;
return (err);
}
int
-dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
{
- return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+ int err = dsl_dataset_hold(name, owner, dsp);
+ if (err)
+ return (err);
+ if ((*dsp)->ds_phys->ds_num_children > 0 &&
+ !DS_MODE_IS_READONLY(flags)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EROFS);
+ }
+ if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
+ dsl_dataset_rele(*dsp, owner);
+ return (EBUSY);
+ }
+ return (0);
}
void
@@ -477,11 +646,11 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
VERIFY(0 == dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
(void) strcat(name, "@");
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
if (!MUTEX_HELD(&ds->ds_lock)) {
- /*
- * We use a "recursive" mutex so that we
- * can call dprintf_ds() with ds_lock held.
- */
mutex_enter(&ds->ds_lock);
(void) strcat(name, ds->ds_snapname);
mutex_exit(&ds->ds_lock);
@@ -505,7 +674,6 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
if (ds->ds_snapname[0]) {
++result; /* adding one for the @-sign */
if (!MUTEX_HELD(&ds->ds_lock)) {
- /* see dsl_datset_name */
mutex_enter(&ds->ds_lock);
result += strlen(ds->ds_snapname);
mutex_exit(&ds->ds_lock);
@@ -519,119 +687,160 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}
void
-dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
+ rw_exit(&ds->ds_rwlock);
+ }
+ dsl_dataset_drop_ref(ds, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
+{
+ ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
+ (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
+
mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_open_refcount, >=, weight);
- ds->ds_open_refcount -= weight;
- dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
- mode, ds->ds_open_refcount);
+ ds->ds_owner = NULL;
+ if (RW_WRITE_HELD(&ds->ds_rwlock)) {
+ rw_exit(&ds->ds_rwlock);
+ cv_broadcast(&ds->ds_exclusive_cv);
+ }
mutex_exit(&ds->ds_lock);
+ if (ds->ds_dbuf)
+ dsl_dataset_drop_ref(ds, owner);
+ else
+ dsl_dataset_evict(ds->ds_dbuf, ds);
+}
- dmu_buf_rele(ds->ds_dbuf, tag);
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
+{
+ boolean_t gotit = FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_owner == NULL &&
+ (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
+ ds->ds_owner = owner;
+ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
+ rw_exit(&ds->ds_rwlock);
+ gotit = TRUE;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (gotit);
}
void
-dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
- objset_t *mos = dp->dp_meta_objset;
+ ASSERT3P(owner, ==, ds->ds_owner);
+ if (!RW_WRITE_HELD(&ds->ds_rwlock))
+ rw_enter(&ds->ds_rwlock, RW_WRITER);
+}
+
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ uint64_t flags, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
- dsl_dataset_t *ds;
uint64_t dsobj;
- dsl_dir_t *dd;
+ objset_t *mos = dp->dp_meta_objset;
- dsl_dir_create_root(mos, ddobjp, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
+ if (origin == NULL)
+ origin = dp->dp_origin_snap;
+
+ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+ ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_flags = flags;
dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+ DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
dsphys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+
+ if (origin) {
+ dsphys->ds_prev_snap_obj = origin->ds_object;
+ dsphys->ds_prev_snap_txg =
+ origin->ds_phys->ds_creation_txg;
+ dsphys->ds_used_bytes =
+ origin->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes =
+ origin->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ origin->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_bp = origin->ds_phys->ds_bp;
+ dsphys->ds_flags |= origin->ds_phys->ds_flags;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ origin->ds_phys->ds_num_children++;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+ if (origin->ds_phys->ds_next_clones_obj == 0) {
+ origin->ds_phys->ds_next_clones_obj =
+ zap_create(mos,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY(0 == zap_add_int(mos,
+ origin->ds_phys->ds_next_clones_obj,
+ dsobj, tx));
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_origin_obj = origin->ds_object;
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
- VERIFY(0 ==
- dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
- (void) dmu_objset_create_impl(dp->dp_spa, ds,
- &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (dsobj);
}
uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
uint64_t dsobj, ddobj;
- objset_t *mos = dp->dp_meta_objset;
dsl_dir_t *dd;
- ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
- ASSERT(clone_parent == NULL ||
- clone_parent->ds_phys->ds_num_children > 0);
ASSERT(lastname[0] != '@');
- ASSERT(dmu_tx_is_syncing(tx));
- ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (clone_parent) {
- dsphys->ds_prev_snap_obj = clone_parent->ds_object;
- dsphys->ds_prev_snap_txg =
- clone_parent->ds_phys->ds_creation_txg;
- dsphys->ds_used_bytes =
- clone_parent->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes =
- clone_parent->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- clone_parent->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+ dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
- dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
- clone_parent->ds_phys->ds_num_children++;
+ dsl_deleg_set_create_perms(dd, tx, cr);
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
- }
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
dsl_dir_close(dd, FTAG);
return (dsobj);
@@ -653,21 +862,24 @@ dsl_snapshot_destroy_one(char *name, void *arg)
(void) strcat(name, "@");
(void) strcat(name, da->snapname);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
da->dstg, &ds);
cp = strchr(name, '@');
*cp = '\0';
- if (err == ENOENT)
- return (0);
- if (err) {
+ if (err == 0) {
+ dsl_dataset_make_exclusive(ds, da->dstg);
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+ dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, da->dstg, 0);
+ } else if (err == ENOENT) {
+ err = 0;
+ } else {
(void) strcpy(da->failed, name);
- return (err);
}
-
- dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
- return (0);
+ return (err);
}
/*
@@ -681,16 +893,8 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
struct destroyarg da;
dsl_sync_task_t *dst;
spa_t *spa;
- char *cp;
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
+ err = spa_open(fsname, &spa, FTAG);
if (err)
return (err);
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
@@ -706,17 +910,14 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
for (dst = list_head(&da.dstg->dstg_tasks); dst;
dst = list_next(&da.dstg->dstg_tasks, dst)) {
dsl_dataset_t *ds = dst->dst_arg1;
+ /*
+ * Return the file system name that triggered the error
+ */
if (dst->dst_err) {
dsl_dataset_name(ds, fsname);
- cp = strchr(fsname, '@');
- *cp = '\0';
+ *strchr(fsname, '@') = '\0';
}
- /*
- * If it was successful, destroy_sync would have
- * closed the ds
- */
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
+ dsl_dataset_disown(ds, da.dstg);
}
dsl_sync_task_group_destroy(da.dstg);
@@ -724,36 +925,33 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
return (err);
}
+/*
+ * ds must be opened as OWNER. On return (whether successful or not),
+ * ds will be closed and caller can no longer dereference it.
+ */
int
-dsl_dataset_destroy(const char *name)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
int err;
dsl_sync_task_group_t *dstg;
objset_t *os;
- dsl_dataset_t *ds;
dsl_dir_t *dd;
uint64_t obj;
- if (strchr(name, '@')) {
+ if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
+ dsl_dataset_make_exclusive(ds, tag);
+
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, FTAG, 0);
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
+ ds, tag, 0);
+ goto out;
}
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err)
- return (err);
- ds = os->os->os_dsl_dataset;
dd = ds->ds_dir;
/*
@@ -762,10 +960,12 @@ dsl_dataset_destroy(const char *name)
*/
err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
dsl_dataset_destroy_begin_sync, ds, NULL, 0);
- if (err) {
- dmu_objset_close(os);
- return (err);
- }
+ if (err)
+ goto out;
+
+ err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ if (err)
+ goto out;
/*
* remove the objects in open context, so that we won't
@@ -773,66 +973,73 @@ dsl_dataset_destroy(const char *name)
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
ds->ds_phys->ds_prev_snap_txg)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- /*
- * Perhaps there is not enough disk
- * space. Just deal with it from
- * dsl_dataset_destroy_sync().
- */
- dmu_tx_abort(tx);
- continue;
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- dmu_tx_commit(tx);
+ /*
+ * Ignore errors, if there is not enough disk space
+ * we will deal with it in dsl_dataset_destroy_sync().
+ */
+ (void) dmu_free_object(os, obj);
}
- /* Make sure it's not dirty before we finish destroying it. */
- txg_wait_synced(dd->dd_pool, 0);
dmu_objset_close(os);
if (err != ESRCH)
- return (err);
+ goto out;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
if (err)
- return (err);
+ goto out;
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
+ if (ds->ds_user_ptr) {
+ /*
+ * We need to sync out all in-flight IO before we try
+ * to evict (the dataset evict func is trying to clear
+ * the cached entries for this dataset in the ARC).
+ */
+ txg_wait_synced(dd->dd_pool, 0);
}
/*
* Blow away the dsl_dir + head dataset.
*/
+ dsl_dataset_make_exclusive(ds, tag);
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, FTAG, 0);
+ dsl_dataset_destroy_sync, ds, tag, 0);
dsl_sync_task_create(dstg, dsl_dir_destroy_check,
dsl_dir_destroy_sync, dd, FTAG, 0);
err = dsl_sync_task_group_wait(dstg);
dsl_sync_task_group_destroy(dstg);
- /* if it is successful, *destroy_sync will close the ds+dd */
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ /* if it is successful, dsl_dir_destroy_sync will close the dd */
+ if (err)
dsl_dir_close(dd, FTAG);
- }
+out:
+ dsl_dataset_disown(ds, tag);
return (err);
}
int
-dsl_dataset_rollback(dsl_dataset_t *ds)
+dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
{
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- return (dsl_sync_task_do(ds->ds_dir->dd_pool,
+ int err;
+
+ ASSERT(ds->ds_owner);
+
+ dsl_dataset_make_exclusive(ds, ds->ds_owner);
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, NULL, 0));
+ ds, &ost, 0);
+ /* drop exclusive access */
+ mutex_enter(&ds->ds_lock);
+ rw_exit(&ds->ds_rwlock);
+ cv_broadcast(&ds->ds_exclusive_cv);
+ mutex_exit(&ds->ds_lock);
+ return (err);
}
void *
@@ -904,14 +1111,56 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
}
}
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use. To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+static void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+ uint64_t mrs_used;
+ uint64_t dlused, dlcomp, dluncomp;
+
+ ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0)
+ mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+ else
+ mrs_used = 0;
+
+ VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
+ &dluncomp));
+
+ ASSERT3U(dlused, <=, mrs_used);
+ ds->ds_phys->ds_unique_bytes =
+ ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+
+ if (!DS_UNIQUE_IS_ACCURATE(ds) &&
+ spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+static uint64_t
+dsl_dataset_unique(dsl_dataset_t *ds)
+{
+ if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+
+ return (ds->ds_phys->ds_unique_bytes);
+}
+
struct killarg {
- uint64_t *usedp;
- uint64_t *compressedp;
- uint64_t *uncompressedp;
+ dsl_dataset_t *ds;
zio_t *zio;
dmu_tx_t *tx;
};
+/* ARGSUSED */
static int
kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
@@ -920,16 +1169,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
ASSERT3U(bc->bc_errno, ==, 0);
- /*
- * Since this callback is not called concurrently, no lock is
- * needed on the accounting values.
- */
- *ka->usedp += bp_get_dasize(spa, bp);
- *ka->compressedp += BP_GET_PSIZE(bp);
- *ka->uncompressedp += BP_GET_UCSIZE(bp);
- /* XXX check for EIO? */
- (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
- ARC_NOWAIT);
+ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+
return (0);
}
@@ -938,14 +1180,12 @@ static int
dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dmu_objset_type_t *ost = arg2;
/*
- * There must be a previous snapshot. I suppose we could roll
- * it back to being empty (and re-initialize the upper (ZPL)
- * layer). But for now there's no way to do this via the user
- * interface.
+ * We can only roll back to emptyness if it is a ZPL objset.
*/
- if (ds->ds_phys->ds_prev_snap_txg == 0)
+ if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
return (EINVAL);
/*
@@ -966,13 +1206,44 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dmu_objset_type_t *ost = arg2;
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ /*
+ * Before the roll back destroy the zil.
+ */
+ if (ds->ds_user_ptr != NULL) {
+ zil_rollback_destroy(
+ ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
+
+ /*
+ * We need to make sure that the objset_impl_t is reopened after
+ * we do the rollback, otherwise it will have the wrong
+ * objset_phys_t. Normally this would happen when this
+ * dataset-open is closed, thus causing the
+ * dataset to be immediately evicted. But when doing "zfs recv
+ * -F", we reopen the objset before that, so that there is no
+ * window where the dataset is closed and inconsistent.
+ */
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+
+ /* Transfer space that was freed since last snap back to the head. */
+ {
+ uint64_t used;
+
+ VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
+ ds->ds_origin_txg, UINT64_MAX, &used));
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_SNAP, DD_USED_HEAD, tx);
+ }
+
/* Zero out the deadlist. */
bplist_close(&ds->ds_deadlist);
bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
@@ -984,39 +1255,65 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
/* Free blkptrs that we gave birth to */
zio_t *zio;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
struct killarg ka;
zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
ZIO_FLAG_MUSTSUCCEED);
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
+ ka.ds = ds;
ka.zio = zio;
ka.tx = tx;
(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
ADVANCE_POST, kill_blkptr, &ka);
(void) zio_wait(zio);
-
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
}
- /* Change our contents to that of the prev snapshot */
- ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
- ds->ds_phys->ds_unique_bytes = 0;
+ ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) ||
+ ds->ds_phys->ds_unique_bytes == 0);
+
+ if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
+ /* Change our contents to that of the prev snapshot */
+
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
+ ds->ds_prev->ds_phys->ds_used_bytes);
+
+ ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+ ds->ds_phys->ds_used_bytes =
+ ds->ds_prev->ds_phys->ds_used_bytes;
+ ds->ds_phys->ds_compressed_bytes =
+ ds->ds_prev->ds_phys->ds_compressed_bytes;
+ ds->ds_phys->ds_uncompressed_bytes =
+ ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+ ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ }
+ } else {
+ objset_impl_t *osi;
+
+ ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
+ ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
+ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
+ ds->ds_phys->ds_flags = 0;
+ ds->ds_phys->ds_unique_bytes = 0;
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
+ &ds->ds_phys->ds_bp, *ost, tx);
+#ifdef _KERNEL
+ zfs_create_fs(&osi->os, kcred, NULL, tx);
+#endif
}
+
+ spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", ds->ds_object);
}
/* ARGSUSED */
@@ -1024,6 +1321,9 @@ static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t count;
+ int err;
/*
* Can't delete a head dataset if there are snapshots of it.
@@ -1034,26 +1334,44 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
return (EINVAL);
+ /*
+ * This is really a dsl_dir thing, but check it here so that
+ * we'll be less likely to leave this dataset inconsistent &
+ * nearly destroyed.
+ */
+ err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+ if (err)
+ return (err);
+ if (count != 0)
+ return (EEXIST);
+
return (0);
}
/* ARGSUSED */
static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
/* Mark it as inconsistent on-disk, in case we crash */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
}
/* ARGSUSED */
-static int
+int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
+ /* we have an owner hold, so noone else can destroy us */
+ ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
+
/* Can't delete a branch point. */
if (ds->ds_phys->ds_num_children > 1)
return (EEXIST);
@@ -1078,11 +1396,50 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
+struct refsarg {
+ kmutex_t lock;
+ boolean_t gone;
+ kcondvar_t cv;
+};
+
+/* ARGSUSED */
+static void
+dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
+{
+ struct refsarg *arg = argv;
+
+ mutex_enter(&arg->lock);
+ arg->gone = TRUE;
+ cv_signal(&arg->cv);
+ mutex_exit(&arg->lock);
+}
+
static void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
+{
+ struct refsarg arg;
+
+ mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
+ arg.gone = FALSE;
+ (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
+ dsl_dataset_refs_gone);
+ dmu_buf_rele(ds->ds_dbuf, tag);
+ mutex_enter(&arg.lock);
+ while (!arg.gone)
+ cv_wait(&arg.cv, &arg.lock);
+ ASSERT(arg.gone);
+ mutex_exit(&arg.lock);
+ ds->ds_dbuf = NULL;
+ ds->ds_phys = NULL;
+ mutex_destroy(&arg.lock);
+ cv_destroy(&arg.cv);
+}
+
+void
+dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
zio_t *zio;
int err;
int after_branch_point = FALSE;
@@ -1091,29 +1448,53 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
dsl_dataset_t *ds_prev = NULL;
uint64_t obj;
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ ASSERT(ds->ds_owner);
ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ /* signal any waiters that this dataset is going away */
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = dsl_reaper;
+ cv_broadcast(&ds->ds_exclusive_cv);
+ mutex_exit(&ds->ds_lock);
+
+ /* Remove our reservation */
+ if (ds->ds_reserved != 0) {
+ uint64_t val = 0;
+ dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+ ASSERT3U(ds->ds_reserved, ==, 0);
+ }
+
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ dsl_pool_ds_destroyed(ds, tx);
+
obj = ds->ds_object;
if (ds->ds_phys->ds_prev_snap_obj != 0) {
if (ds->ds_prev) {
ds_prev = ds->ds_prev;
} else {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_prev));
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
}
after_branch_point =
(ds_prev->ds_phys->ds_next_snap_obj != obj);
dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
if (after_branch_point &&
+ ds_prev->ds_phys->ds_next_clones_obj != 0) {
+ VERIFY(0 == zap_remove_int(mos,
+ ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(0 == zap_add_int(mos,
+ ds_prev->ds_phys->ds_next_clones_obj,
+ ds->ds_phys->ds_next_snap_obj, tx));
+ }
+ }
+ if (after_branch_point &&
ds->ds_phys->ds_next_snap_obj == 0) {
/* This clone is toast. */
ASSERT(ds_prev->ds_phys->ds_num_children > 1);
@@ -1130,14 +1511,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
blkptr_t bp;
dsl_dataset_t *ds_next;
uint64_t itor = 0;
+ uint64_t old_unique;
+ int64_t used = 0, compressed = 0, uncompressed = 0;
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_next));
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+ old_unique = dsl_dataset_unique(ds_next);
+
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
ds_next->ds_phys->ds_prev_snap_obj =
ds->ds_phys->ds_prev_snap_obj;
@@ -1154,8 +1536,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
*
* XXX we're doing this long task with the config lock held
*/
- while (bplist_iterate(&ds_next->ds_deadlist, &itor,
- &bp) == 0) {
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
&bp, tx));
@@ -1170,16 +1551,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
compressed += BP_GET_PSIZE(&bp);
uncompressed += BP_GET_UCSIZE(&bp);
/* XXX check return value? */
- (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ (void) dsl_free(zio, dp, tx->tx_txg,
&bp, NULL, NULL, ARC_NOWAIT);
}
}
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -compressed, -uncompressed, tx);
+
/* free next's deadlist */
bplist_close(&ds_next->ds_deadlist);
bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
/* set next's deadlist to our deadlist */
+ bplist_close(&ds->ds_deadlist);
ds_next->ds_phys->ds_deadlist_obj =
ds->ds_phys->ds_deadlist_obj;
VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
@@ -1200,51 +1588,50 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
* config lock held
*/
dsl_dataset_t *ds_after_next;
+ uint64_t space;
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_after_next));
- itor = 0;
- while (bplist_iterate(&ds_after_next->ds_deadlist,
- &itor, &bp) == 0) {
- if (bp.blk_birth >
- ds->ds_phys->ds_prev_snap_txg &&
- bp.blk_birth <=
- ds->ds_phys->ds_creation_txg) {
- ds_next->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- }
+ VERIFY(0 == dsl_dataset_hold_obj(dp,
+ ds_next->ds_phys->ds_next_snap_obj,
+ FTAG, &ds_after_next));
+
+ VERIFY(0 ==
+ bpli